diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1899,20 +1899,17 @@ uint64_t getClockFrequency() const override { return ClockFrequency; } /// Allocate and construct an AMDGPU kernel. - Expected - constructKernelEntry(const __tgt_offload_entry &KernelEntry, - DeviceImageTy &Image) override { + Expected + constructKernel(const __tgt_offload_entry &KernelEntry, + OMPTgtExecModeFlags ExecMode) override { + // Allocate and construct the AMDGPU kernel. + AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate(); + if (!AMDGPUKernel) + return Plugin::error("Failed to allocate memory for AMDGPU kernel"); - Expected ExecModeOrErr = - getExecutionModeForKernel(KernelEntry.name, Image); - if (!ExecModeOrErr) - return ExecModeOrErr.takeError(); + new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode); - // Allocate and initialize the AMDGPU kernel. - AMDGPUKernelTy *AMDKernel = Plugin::get().allocate(); - new (AMDKernel) AMDGPUKernelTy(KernelEntry.name, ExecModeOrErr.get()); - - return AMDKernel; + return *AMDGPUKernel; } /// Set the current context to this device's context. Do nothing since the diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -794,9 +794,9 @@ __tgt_offload_entry &DeviceEntry); /// Allocate and construct a kernel object. - virtual Expected - constructKernelEntry(const __tgt_offload_entry &KernelEntry, - DeviceImageTy &Image) = 0; + virtual Expected + constructKernel(const __tgt_offload_entry &KernelEntry, + OMPTgtExecModeFlags ExecMode) = 0; /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -837,8 +837,8 @@ protected: /// Return the execution mode used for kernel \p Name. - Expected getExecutionModeForKernel(StringRef Name, - DeviceImageTy &Image); + virtual Expected + getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image); /// Environment variables defined by the LLVM OpenMP implementation /// regarding the initial number of streams and events. diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -746,21 +746,25 @@ __tgt_offload_entry &DeviceEntry) { DeviceEntry = KernelEntry; + // Retrieve the execution mode. + auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image); + if (!ExecModeOrErr) + return ExecModeOrErr.takeError(); + // Create a kernel object. - auto KernelOrErr = constructKernelEntry(KernelEntry, Image); + auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr); if (!KernelOrErr) return KernelOrErr.takeError(); - GenericKernelTy *Kernel = *KernelOrErr; - assert(Kernel != nullptr && "Invalid kernel"); + GenericKernelTy &Kernel = *KernelOrErr; // Initialize the kernel. - if (auto Err = Kernel->init(*this, Image)) + if (auto Err = Kernel.init(*this, Image)) return Err; // Set the device entry address to the kernel address and store the entry on // the entry table. - DeviceEntry.addr = (void *)Kernel; + DeviceEntry.addr = (void *)&Kernel; Image.getOffloadEntryTable().addEntry(DeviceEntry); return Plugin::success(); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -37,31 +37,80 @@ struct CUDADeviceTy; struct CUDAPluginTy; +/// Class implementing the CUDA device images properties. +struct CUDADeviceImageTy : public DeviceImageTy { + /// Create the CUDA image with the id and the target image pointer. + CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + + /// Load the image as a CUDA module. + Error loadModule() { + assert(!Module && "Module already loaded"); + + CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr); + if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s")) + return Err; + + return Plugin::success(); + } + + /// Unload the CUDA module corresponding to the image. + Error unloadModule() { + assert(Module && "Module not loaded"); + + CUresult Res = cuModuleUnload(Module); + if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s")) + return Err; + + Module = nullptr; + + return Plugin::success(); + } + + /// Getter of the CUDA module. + CUmodule getModule() const { return Module; } + +private: + /// The CUDA module that loaded the image. + CUmodule Module; +}; + /// Class implementing the CUDA kernel functionalities which derives from the /// generic kernel class. struct CUDAKernelTy : public GenericKernelTy { - /// Create a CUDA kernel with a name, an execution mode, and the kernel - /// function. - CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, - CUfunction Func) - : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + /// Create a CUDA kernel with a name and an execution mode. + CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode) + : GenericKernelTy(Name, ExecMode), Func(nullptr) {} - /// Initialize the CUDA kernel + /// Initialize the CUDA kernel. Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override { + CUresult Res; + CUDADeviceImageTy &CUDAImage = static_cast(Image); + + // Retrieve the function pointer of the kernel. + Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName()); + if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s", + getName())) + return Err; + + // Check that the function pointer is valid. + if (!Func) + return Plugin::error("Invalid function for kernel %s", getName()); + int MaxThreads; - CUresult Res = cuFuncGetAttribute( - &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); + Res = cuFuncGetAttribute(&MaxThreads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s")) return Err; - /// Set the maximum number of threads for the CUDA kernel. + // The maximum number of threads cannot exceed the maximum of the kernel. MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); return Plugin::success(); } - /// Launch the CUDA kernel function + /// Launch the CUDA kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; @@ -165,44 +214,6 @@ HandleTy Event; }; -/// Class implementing the CUDA device images properties. -struct CUDADeviceImageTy : public DeviceImageTy { - /// Create the CUDA image with the id and the target image pointer. - CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} - - /// Load the image as a CUDA module. - Error loadModule() { - assert(!Module && "Module already loaded"); - - CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr); - if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s")) - return Err; - - return Plugin::success(); - } - - /// Unload the CUDA module corresponding to the image. - Error unloadModule() { - assert(Module && "Module not loaded"); - - CUresult Res = cuModuleUnload(Module); - if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s")) - return Err; - - Module = nullptr; - - return Plugin::success(); - } - - /// Getter of the CUDA module. - CUmodule getModule() const { return Module; } - -private: - /// The CUDA module that loaded the image. - CUmodule Module; -}; - /// Class implementing the CUDA device functionalities which derives from the /// generic device class. struct CUDADeviceTy : public GenericDeviceTy { @@ -330,32 +341,17 @@ } /// Allocate and construct a CUDA kernel. - Expected - constructKernelEntry(const __tgt_offload_entry &KernelEntry, - DeviceImageTy &Image) override { - CUDADeviceImageTy &CUDAImage = static_cast(Image); - - // Retrieve the function pointer of the kernel. - CUfunction Func; - CUresult Res = - cuModuleGetFunction(&Func, CUDAImage.getModule(), KernelEntry.name); - if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s", - KernelEntry.name)) - return std::move(Err); - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry), - KernelEntry.name, DPxPTR(Func)); - - Expected ExecModeOrErr = - getExecutionModeForKernel(KernelEntry.name, Image); - if (!ExecModeOrErr) - return ExecModeOrErr.takeError(); - - // Allocate and initialize the CUDA kernel. + Expected + constructKernel(const __tgt_offload_entry &KernelEntry, + OMPTgtExecModeFlags ExecMode) override { + // Allocate and construct the CUDA kernel. CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); - new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecModeOrErr.get(), Func); + if (!CUDAKernel) + return Plugin::error("Failed to allocate memory for CUDA kernel"); + + new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode); - return CUDAKernel; + return *CUDAKernel; } /// Set the current context to this device's context. diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -49,14 +49,27 @@ /// Class implementing kernel functionalities for GenELF64. struct GenELF64KernelTy : public GenericKernelTy { - /// Construct the kernel with a name, execution mode and a function. - GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, - void (*Func)(void)) - : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + /// Construct the kernel with a name and an execution mode. + GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode) + : GenericKernelTy(Name, ExecMode), Func(nullptr) {} /// Initialize the kernel. - Error initImpl(GenericDeviceTy &GenericDevice, - DeviceImageTy &Image) override { + Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { + // Functions have zero size. + GlobalTy Global(getName(), 0); + + // Get the metadata (address) of the kernel function. + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, Global)) + return Err; + + // Check that the function pointer is valid. + if (!Global.getPtr()) + return Plugin::error("Invalid function for kernel %s", getName()); + + // Save the function pointer. + Func = (void (*)())Global.getPtr(); + // Set the maximum number of threads to a single. MaxNumThreads = 1; return Plugin::success(); @@ -119,23 +132,18 @@ Error deinitImpl() override { return Plugin::success(); } /// Construct the kernel for a specific image on the device. - Expected - constructKernelEntry(const __tgt_offload_entry &KernelEntry, - DeviceImageTy &Image) override { - GlobalTy Func(KernelEntry); - - // Get the metadata (address) of the kernel function. - GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); - if (auto Err = GHandler.getGlobalMetadataFromDevice(*this, Image, Func)) - return std::move(Err); - - // Allocate and create the kernel. + Expected + constructKernel(const __tgt_offload_entry &KernelEntry, + OMPTgtExecModeFlags ExecMode) override { + // Allocate and construct the kernel. GenELF64KernelTy *GenELF64Kernel = Plugin::get().allocate(); - new (GenELF64Kernel) GenELF64KernelTy( - KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)())Func.getPtr()); + if (!GenELF64Kernel) + return Plugin::error("Failed to allocate memory for GenELF64 kernel"); - return GenELF64Kernel; + new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode); + + return *GenELF64Kernel; } /// Set the current context to this device, which is a no-op. @@ -312,6 +320,13 @@ } Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } +protected: + /// Retrieve the execution mode for kernels. All kernels use the generic mode. + Expected + getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override { + return OMP_TGT_EXEC_MODE_GENERIC; + } + private: /// Grid values for Generic ELF64 plugins. static constexpr GV GenELF64GridValues = {