Index: libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- libomptarget/plugins/cuda/src/rtl.cpp
+++ libomptarget/plugins/cuda/src/rtl.cpp
@@ -60,11 +60,25 @@
   std::vector<__tgt_offload_entry> Entries;
 };
 
-enum ExecutionModeType {
-  SPMD, // constructors, destructors,
-        // combined constructs (`teams distribute parallel for [simd]`)
-  GENERIC, // everything else
-  NONE
+enum ExecutionModeType : int8_t {
+  SPMD = 0,    // constructors, destructors,
+               // combined constructs (`teams distribute parallel for [simd]`)
+  Generic = 1, // everything else
+  None = 2
+};
+
+// Each target kernel is described by the following data structure, which is
+// generated by the compiler and contains the kernel's computation
+// properties.
+struct TargetKernelCompProperties {
+  int8_t ExecutionMode;
+  int32_t NumReductionVars;
+  int32_t ReductionVarsSize;
+
+  TargetKernelCompProperties(int8_t _ExecutionMode, int32_t _NumReductionVars,
+      int32_t _ReductionVarsSize) : ExecutionMode(_ExecutionMode),
+      NumReductionVars(_NumReductionVars),
+      ReductionVarsSize(_ReductionVarsSize) {}
 };
 
 /// Use a single entity to encode a kernel and a set of flags
@@ -76,8 +90,15 @@
   // 1 - Generic mode (with master warp)
   int8_t ExecutionMode;
 
-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
-      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+  // Number of reduction variables
+  int32_t NumReductionVars;
+
+  // Total size of reduction variables
+  int32_t ReductionVarsSize;
+
+  KernelTy(CUfunction _Func, TargetKernelCompProperties _CP) : Func(_Func),
+      ExecutionMode(_CP.ExecutionMode), NumReductionVars(_CP.NumReductionVars),
+      ReductionVarsSize(_CP.ReductionVarsSize) {}
 };
 
 /// List that contains all the kernels.
@@ -442,44 +463,49 @@
     DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
 
-    // default value GENERIC (in case symbol is missing from cubin file)
-    int8_t ExecModeVal = ExecutionModeType::GENERIC;
-    std::string ExecModeNameStr (e->name);
-    ExecModeNameStr += "_exec_mode";
-    const char *ExecModeName = ExecModeNameStr.c_str();
+    // Load the kernel's computation properties.
+    // Default values (in case the symbol is missing from the cubin file):
+    // Generic, 0, 0.
+    struct TargetKernelCompProperties CP(ExecutionModeType::Generic, 0, 0);
 
-    CUdeviceptr ExecModePtr;
+    std::string CPNameStr (e->name);
+    CPNameStr += "_property";
+    const char *CPName = CPNameStr.c_str();
+
+    CUdeviceptr CPPtr;
     size_t cusize;
-    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+    err = cuModuleGetGlobal(&CPPtr, &cusize, cumod, CPName);
     if (err == CUDA_SUCCESS) {
-      if ((size_t)cusize != sizeof(int8_t)) {
-        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-            ExecModeName, cusize, sizeof(int8_t));
+      if ((size_t)cusize != sizeof(TargetKernelCompProperties)) {
+        DP("Loading global target kernel computation properties '%s' - size "
+           "mismatch (%zd != %zd)\n", CPName, cusize,
+           sizeof(TargetKernelCompProperties));
        CUDA_ERR_STRING(err);
        return NULL;
      }
 
-      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+      err = cuMemcpyDtoH(&CP, CPPtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
-            "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
-            DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+           DPxPTR(&CP), DPxPTR(CPPtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }
 
-      if (ExecModeVal < 0 || ExecModeVal > 1) {
-        DP("Error wrong exec_mode value specified in cubin file: %d\n",
-           ExecModeVal);
+      if (CP.ExecutionMode < SPMD || CP.ExecutionMode >= None) {
+        DP("Error: wrong ExecutionMode value specified in cubin file: %d\n",
+           CP.ExecutionMode);
        return NULL;
      }
    } else {
-      DP("Loading global exec_mode '%s' - symbol missing, using default value "
-          "GENERIC (1)\n", ExecModeName);
+      DP("Loading global target kernel computation properties '%s' - symbol "
+         "missing, using default values: ExecutionMode Generic (1), "
+         "NumReductionVars 0, ReductionVarsSize 0\n", CPName);
      CUDA_ERR_STRING(err);
    }
 
-    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+    KernelsList.push_back(KernelTy(fun, CP));
 
    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
@@ -586,7 +612,8 @@
  }
 
  // All args are references.
-  std::vector<void *> args(arg_num);
+  // Allocate one more pointer for the reduction scratchpad.
+  std::vector<void *> args(arg_num + 1);
  std::vector<void *> ptrs(arg_num);
 
  for (int32_t i = 0; i < arg_num; ++i) {
@@ -602,7 +629,7 @@
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
-    if (KernelInfo->ExecutionMode == GENERIC) {
+    if (KernelInfo->ExecutionMode == Generic) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
@@ -668,6 +695,23 @@
    DP("Using requested number of teams %d\n", team_num);
  }
 
+  void *Scratchpad = NULL;
+  size_t ScratchpadSize = KernelInfo->NumReductionVars == 0 ? 0 :
+      256 /*space for timestamp*/ +
+      cudaBlocksPerGrid * KernelInfo->ReductionVarsSize +
+      KernelInfo->NumReductionVars * /*padding=*/256;
+  if (ScratchpadSize > 0) {
+    Scratchpad = __tgt_rtl_data_alloc(device_id, ScratchpadSize, Scratchpad);
+    if (Scratchpad == NULL) {
+      DP("Failed to allocate reduction scratchpad\n");
+      return OFFLOAD_FAIL;
+    }
+
+    unsigned timestamp = 0;
+    __tgt_rtl_data_submit(device_id, Scratchpad, &timestamp, sizeof(unsigned));
+  }
+  args[arg_num] = &Scratchpad;
+
  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);
@@ -692,6 +736,9 @@
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }
 
+  if (Scratchpad)
+    __tgt_rtl_data_delete(device_id, Scratchpad);
+
  return OFFLOAD_SUCCESS;
 }
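For reference, a minimal sketch of the cubin-side counterpart this patch expects. It assumes the compiler emits one "<kernel name>_property" device global per target region whose layout matches the host-side TargetKernelCompProperties above (int8_t ExecutionMode, three bytes of implicit padding, then two int32_t fields, so sizeof == 12 with default alignment). The kernel name and initializer values below are made up for illustration.

    #include <cstdint>

    // Device-side mirror of TargetKernelCompProperties (assumed layout).
    struct TargetKernelCompProperties {
      int8_t ExecutionMode;      // 0 = SPMD, 1 = Generic
      int32_t NumReductionVars;  // reduction variables in the target region
      int32_t ReductionVarsSize; // total size of those variables, in bytes
    };

    // Hypothetical property global for a Generic-mode region with two
    // 8-byte reduction variables.
    __device__ TargetKernelCompProperties __omp_offloading_fake_property = {
        /*ExecutionMode=*/1, /*NumReductionVars=*/2, /*ReductionVarsSize=*/16};

With those values and, for example, cudaBlocksPerGrid == 128, the launch path above would size the reduction scratchpad as 256 + 128 * 16 + 2 * 256 = 2816 bytes, zero its leading timestamp word via __tgt_rtl_data_submit, pass the scratchpad as the extra args[arg_num] kernel argument, and free it with __tgt_rtl_data_delete once the kernel has completed.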