Index: libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- libomptarget/plugins/cuda/src/rtl.cpp
+++ libomptarget/plugins/cuda/src/rtl.cpp
@@ -60,11 +60,25 @@
   std::vector<__tgt_offload_entry> Entries;
 };
 
-enum ExecutionModeType {
-  SPMD, // constructors, destructors,
-        // combined constructs (`teams distribute parallel for [simd]`)
-  GENERIC, // everything else
-  NONE
+enum ExecutionModeType : int8_t {
+  SPMD = 0,    // constructors, destructors,
+               // combined constructs (`teams distribute parallel for [simd]`)
+  Generic = 1, // everything else
+  None = 2
+};
+
+// Each target kernel is described by the following data structure, which is
+// generated by the compiler and contains the kernel's computation
+// properties.
+struct TargetKernelCompProperties {
+  int8_t ExecutionMode;
+  int32_t NumReductionVars;
+  int32_t ReductionVarsSize;
+
+  TargetKernelCompProperties(int8_t _ExecutionMode, int32_t _NumReductionVars,
+      int32_t _ReductionVarsSize) : ExecutionMode(_ExecutionMode),
+      NumReductionVars(_NumReductionVars),
+      ReductionVarsSize(_ReductionVarsSize) {}
 };
 
 /// Use a single entity to encode a kernel and a set of flags
@@ -76,8 +90,15 @@
   // 1 - Generic mode (with master warp)
   int8_t ExecutionMode;
 
-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
-      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+  // Number of reduction variables
+  int32_t NumReductionVars;
+
+  // Total size of reduction variables
+  int32_t ReductionVarsSize;
+
+  KernelTy(CUfunction _Func, TargetKernelCompProperties _CP) : Func(_Func),
+      ExecutionMode(_CP.ExecutionMode), NumReductionVars(_CP.NumReductionVars),
+      ReductionVarsSize(_CP.ReductionVarsSize) {}
 };
 
 /// List that contains all the kernels.
@@ -442,44 +463,49 @@
     DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
 
-    // default value GENERIC (in case symbol is missing from cubin file)
-    int8_t ExecModeVal = ExecutionModeType::GENERIC;
-    std::string ExecModeNameStr (e->name);
-    ExecModeNameStr += "_exec_mode";
-    const char *ExecModeName = ExecModeNameStr.c_str();
+    // Load the kernel's computation properties.
+    // Default values (in case the symbol is missing from the cubin file):
+    // Generic, 0, 0.
+    struct TargetKernelCompProperties CP(ExecutionModeType::Generic, 0, 0);
 
-    CUdeviceptr ExecModePtr;
+    std::string CPNameStr (e->name);
+    CPNameStr += "_property";
+    const char *CPName = CPNameStr.c_str();
+
+    CUdeviceptr CPPtr;
     size_t cusize;
-    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+    err = cuModuleGetGlobal(&CPPtr, &cusize, cumod, CPName);
     if (err == CUDA_SUCCESS) {
-      if ((size_t)cusize != sizeof(int8_t)) {
-        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-            ExecModeName, cusize, sizeof(int8_t));
+      if ((size_t)cusize != sizeof(TargetKernelCompProperties)) {
+        DP("Loading global target kernel computation properties '%s' - size "
+           "mismatch (%zd != %zd)\n", CPName, cusize,
+           sizeof(TargetKernelCompProperties));
        CUDA_ERR_STRING(err);
        return NULL;
      }
 
-      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+      err = cuMemcpyDtoH(&CP, CPPtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
-            "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
-            DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+           DPxPTR(&CP), DPxPTR(CPPtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }
 
-      if (ExecModeVal < 0 || ExecModeVal > 1) {
-        DP("Error wrong exec_mode value specified in cubin file: %d\n",
-           ExecModeVal);
+      if (CP.ExecutionMode < SPMD || CP.ExecutionMode >= None) {
+        DP("Error: wrong ExecutionMode value specified in cubin file: %d\n",
+           CP.ExecutionMode);
        return NULL;
      }
    } else {
-      DP("Loading global exec_mode '%s' - symbol missing, using default value "
-          "GENERIC (1)\n", ExecModeName);
+      DP("Loading global target kernel computation properties '%s' - symbol "
+         "missing, using default values: ExecutionMode Generic (1), "
+         "NumReductionVars 0, ReductionVarsSize 0\n", CPName);
      CUDA_ERR_STRING(err);
    }
 
-    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+    KernelsList.push_back(KernelTy(fun, CP));
 
    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
@@ -586,7 +612,8 @@
  }
 
  // All args are references.
-  std::vector<void *> args(arg_num);
+  // Allocate one more pointer for the reduction scratchpad.
+  std::vector<void *> args(arg_num + 1);
  std::vector<void *> ptrs(arg_num);
 
  for (int32_t i = 0; i < arg_num; ++i) {
@@ -602,7 +629,7 @@
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
-    if (KernelInfo->ExecutionMode == GENERIC) {
+    if (KernelInfo->ExecutionMode == Generic) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
@@ -668,6 +695,23 @@
    DP("Using requested number of teams %d\n", team_num);
  }
 
+  void *Scratchpad = NULL;
+  size_t ScratchpadSize = KernelInfo->NumReductionVars == 0 ? 0 :
+      256 /*space for timestamp*/ +
+      cudaBlocksPerGrid * KernelInfo->ReductionVarsSize +
+      KernelInfo->NumReductionVars * /*padding=*/256;
+  if (ScratchpadSize > 0) {
+    Scratchpad = __tgt_rtl_data_alloc(device_id, ScratchpadSize, Scratchpad);
+    if (Scratchpad == NULL) {
+      DP("Failed to allocate reduction scratchpad\n");
+      return OFFLOAD_FAIL;
+    }
+
+    unsigned timestamp = 0;
+    __tgt_rtl_data_submit(device_id, Scratchpad, &timestamp, sizeof(unsigned));
+  }
+  args[arg_num] = &Scratchpad;
+
  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);
@@ -692,6 +736,9 @@
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }
 
+  if (Scratchpad)
+    __tgt_rtl_data_delete(device_id, Scratchpad);
+
  return OFFLOAD_SUCCESS;
 }
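For reference, a minimal sketch of the cubin-side counterpart this patch expects. It assumes the compiler emits one "<kernel name>_property" device global per target region whose layout matches the host-side TargetKernelCompProperties above (int8_t ExecutionMode, three bytes of implicit padding, then two int32_t fields, so sizeof == 12 with default alignment). The kernel name and initializer values below are made up for illustration.

    #include <cstdint>

    // Device-side mirror of TargetKernelCompProperties (assumed layout).
    struct TargetKernelCompProperties {
      int8_t ExecutionMode;      // 0 = SPMD, 1 = Generic
      int32_t NumReductionVars;  // reduction variables in the target region
      int32_t ReductionVarsSize; // total size of those variables, in bytes
    };

    // Hypothetical property global for a Generic-mode region with two
    // 8-byte reduction variables.
    __device__ TargetKernelCompProperties __omp_offloading_fake_property = {
        /*ExecutionMode=*/1, /*NumReductionVars=*/2, /*ReductionVarsSize=*/16};

With those values and, for example, cudaBlocksPerGrid == 128, the launch path above would size the reduction scratchpad as 256 + 128 * 16 + 2 * 256 = 2816 bytes, zero its leading timestamp word via __tgt_rtl_data_submit, pass the scratchpad as the extra args[arg_num] kernel argument, and free it with __tgt_rtl_data_delete once the kernel has completed.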