Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1530,7 +1530,7 @@ char GPUName[64]; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) return Err; - Arch = GPUName; + ComputeUnitKind = GPUName; // Get the wavefront size. uint32_t WavefrontSize = 0; @@ -1669,7 +1669,7 @@ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str()); - std::string MCPU = "-plugin-opt=mcpu=" + getArch(); + std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); StringRef Args[] = {LLDPath, "-flavor", @@ -1692,7 +1692,8 @@ MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath.data()).get()); } - std::string getArch() const override { return Arch; } + /// See GenericDeviceTy::getComputeUnitKind(). + std::string getComputeUnitKind() const override { return ComputeUnitKind; } /// Allocate and construct an AMDGPU kernel. Expected @@ -2096,7 +2097,7 @@ hsa_agent_t Agent; /// The GPU architecture. - std::string Arch; + std::string ComputeUnitKind; /// Reference to the host device. 
AMDHostDeviceTy &HostDevice; Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.h @@ -11,12 +11,19 @@ #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H +#include "Utilities.h" + +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Error.h" +#include "llvm/Target/TargetMachine.h" #include #include +#include #include struct __tgt_device_image; @@ -25,25 +32,83 @@ class MemoryBuffer; namespace omp { -namespace jit { - -/// Function type for a callback that will be called after the backend is -/// called. -using PostProcessingFn = std::function>( - std::unique_ptr)>; - -/// Check if \p Image contains bitcode with triple \p Triple. -bool checkBitcodeImage(__tgt_device_image *Image, Triple::ArchType TA); - -/// Compile the bitcode image \p Image and generate the binary image that can be -/// loaded to the target device of the triple \p Triple architecture \p MCpu. \p -/// PostProcessing will be called after codegen to handle cases such as assember -/// as an external tool. -Expected<__tgt_device_image *> compile(__tgt_device_image *Image, - Triple::ArchType TA, std::string MCpu, - unsigned OptLevel, - PostProcessingFn PostProcessing); -} // namespace jit +namespace target { +namespace plugin { +struct GenericDeviceTy; +} // namespace plugin + +/// The JIT infrastructure and caching mechanism. +struct JITEngine { + /// Function type for a callback that will be called after the backend is + /// called. 
+ using PostProcessingFn = + std::function>( + std::unique_ptr)>; + + JITEngine(Triple::ArchType TA); + + /// Run jit compilation if \p Image is a bitcode image, otherwise simply + /// return \p Image. It is expected to return a memory buffer containing the + /// generated device image that could be loaded to the device directly. + Expected<__tgt_device_image *> + process(__tgt_device_image &Image, target::plugin::GenericDeviceTy &Device); + + /// Return true if \p Image is a bitcode image that can be JITed for the given + /// architecture. + bool checkBitcodeImage(__tgt_device_image &Image); + +private: + /// Compile the bitcode image \p Image and generate the binary image that can + /// be loaded to the target device of the triple \p Triple architecture \p + /// MCpu. \p PostProcessing will be called after codegen to handle cases such + /// as assembler as an external tool. + Expected<__tgt_device_image *> compile(__tgt_device_image &Image, + const std::string &ComputeUnitKind, + PostProcessingFn PostProcessing); + + /// Run backend, which contains optimization and code generation. + Expected> + backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel); + + /// Run optimization pipeline. + void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, + unsigned OptLevel); + + /// Run code generation. + void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, + raw_pwrite_stream &OS); + + /// The target triple used by the JIT. + const Triple TT; + + struct CPUImageInfo { + /// LLVM Context in which the modules will be constructed. + LLVMContext Context; + + /// Output images generated from LLVM backend. + SmallVector, 4> JITImages; + + /// A map of embedded IR images to JITed images. + DenseMap<__tgt_device_image *, __tgt_device_image *> TgtImageMap; + }; + + /// Map from "CPUs" (e.g., sm_80, or gfx90a) to the image information we + /// cached for them.
+ StringMap CPUImageMap; + std::mutex CPUImageMapMutex; + + /// Control environment variables. + target::StringEnvar ReplacementModuleFileName = + target::StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_MODULE"); + target::StringEnvar PreOptIRModuleFileName = + target::StringEnvar("LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE"); + target::StringEnvar PostOptIRModuleFileName = + target::StringEnvar("LIBOMPTARGET_JIT_POST_OPT_IR_MODULE"); + target::UInt32Envar JITOptLevel = + target::UInt32Envar("LIBOMPTARGET_JIT_OPT_LEVEL", 3); +}; + +} // namespace target } // namespace omp } // namespace llvm Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/JIT.cpp @@ -11,11 +11,11 @@ #include "JIT.h" #include "Debug.h" +#include "PluginInterface.h" #include "Utilities.h" #include "omptarget.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/LLVMContext.h" @@ -28,7 +28,6 @@ #include "llvm/Object/IRObjectFile.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Passes/PassBuilder.h" -#include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" @@ -39,15 +38,23 @@ #include "llvm/Target/TargetOptions.h" #include +#include #include using namespace llvm; using namespace llvm::object; using namespace omp; +using namespace omp::target; static codegen::RegisterCodeGenFlags RCGF; namespace { + +/// A map from a bitcode image start address to its corresponding triple. If the +/// image is not in the map, it is not a bitcode image. 
+DenseMap BitcodeImageMap; +std::shared_mutex BitcodeImageMapMutex; + std::once_flag InitFlag; void init(Triple TT) { @@ -125,9 +132,9 @@ return Mod; } Expected> -createModuleFromImage(__tgt_device_image *Image, LLVMContext &Context) { - StringRef Data((const char *)Image->ImageStart, - (char *)Image->ImageEnd - (char *)Image->ImageStart); +createModuleFromImage(__tgt_device_image &Image, LLVMContext &Context) { + StringRef Data((const char *)Image.ImageStart, + (char *)Image.ImageEnd - (char *)Image.ImageStart); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /* BufferName */ "", /* RequiresNullTerminator */ false); return createModuleFromMemoryBuffer(MB, Context); @@ -192,44 +199,11 @@ return TM; } -/// -class JITEngine { -public: - JITEngine(Triple::ArchType TA, std::string MCpu) - : TT(Triple::getArchTypeName(TA)), CPU(MCpu), - ReplacementModuleFileName("LIBOMPTARGET_JIT_REPLACEMENT_MODULE"), - PreOptIRModuleFileName("LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE"), - PostOptIRModuleFileName("LIBOMPTARGET_JIT_POST_OPT_IR_MODULE") { - std::call_once(InitFlag, init, TT); - } - - /// Run jit compilation. It is expected to get a memory buffer containing the - /// generated device image that could be loaded to the device directly. - Expected> - run(__tgt_device_image *Image, unsigned OptLevel, - jit::PostProcessingFn PostProcessing); - -private: - /// Run backend, which contains optimization and code generation. - Expected> backend(Module &M, unsigned OptLevel); - - /// Run optimization pipeline. - void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, - unsigned OptLevel); - - /// Run code generation. - void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, - raw_pwrite_stream &OS); - - LLVMContext Context; - const Triple TT; - const std::string CPU; +} // namespace - /// Control environment variables. 
- target::StringEnvar ReplacementModuleFileName; - target::StringEnvar PreOptIRModuleFileName; - target::StringEnvar PostOptIRModuleFileName; -}; +JITEngine::JITEngine(Triple::ArchType TA) : TT(Triple::getArchTypeName(TA)) { + std::call_once(InitFlag, init, TT); +} void JITEngine::opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M, unsigned OptLevel) { @@ -274,18 +248,19 @@ PM.run(M); } -Expected> JITEngine::backend(Module &M, - unsigned OptLevel) { +Expected> +JITEngine::backend(Module &M, const std::string &ComputeUnitKind, + unsigned OptLevel) { auto RemarksFileOrErr = setupLLVMOptimizationRemarks( - Context, /* RemarksFilename */ "", /* RemarksPasses */ "", + M.getContext(), /* RemarksFilename */ "", /* RemarksPasses */ "", /* RemarksFormat */ "", /* RemarksWithHotness */ false); if (Error E = RemarksFileOrErr.takeError()) return std::move(E); if (*RemarksFileOrErr) (*RemarksFileOrErr)->keep(); - auto TMOrErr = createTargetMachine(M, CPU, OptLevel); + auto TMOrErr = createTargetMachine(M, ComputeUnitKind, OptLevel); if (!TMOrErr) return TMOrErr.takeError(); @@ -315,14 +290,23 @@ return MemoryBuffer::getMemBufferCopy(OS.str()); } -Expected> -JITEngine::run(__tgt_device_image *Image, unsigned OptLevel, - jit::PostProcessingFn PostProcessing) { +Expected<__tgt_device_image *> +JITEngine::compile(__tgt_device_image &Image, + const std::string &ComputeUnitKind, + PostProcessingFn PostProcessing) { + std::lock_guard Lock(CPUImageMapMutex); + + // Check if we JITed this image for the given compute unit kind before. + CPUImageInfo &CPUII = CPUImageMap[ComputeUnitKind]; + if (__tgt_device_image *JITedImage = CPUII.TgtImageMap.lookup(&Image)) + return JITedImage; + Module *Mod = nullptr; // Check if the user replaces the module at runtime or we read it from the // image. + // TODO: Allow the user to specify images per device (Arch + ComputeUnitKind). 
if (!ReplacementModuleFileName.isPresent()) { - auto ModOrErr = createModuleFromImage(Image, Context); + auto ModOrErr = createModuleFromImage(Image, CPUII.Context); if (!ModOrErr) return ModOrErr.takeError(); Mod = ModOrErr->release(); @@ -333,45 +317,66 @@ return createStringError(MBOrErr.getError(), "Could not read replacement module from %s\n", ReplacementModuleFileName.get().c_str()); - auto ModOrErr = createModuleFromMemoryBuffer(MBOrErr.get(), Context); + auto ModOrErr = createModuleFromMemoryBuffer(MBOrErr.get(), CPUII.Context); if (!ModOrErr) return ModOrErr.takeError(); Mod = ModOrErr->release(); } - auto MBOrError = backend(*Mod, OptLevel); + auto MBOrError = backend(*Mod, ComputeUnitKind, JITOptLevel); if (!MBOrError) return MBOrError.takeError(); - return PostProcessing(std::move(*MBOrError)); + auto ImageMBOrErr = PostProcessing(std::move(*MBOrError)); + if (!ImageMBOrErr) + return ImageMBOrErr.takeError(); + + CPUII.JITImages.push_back(std::move(*ImageMBOrErr)); + __tgt_device_image *&JITedImage = CPUII.TgtImageMap[&Image]; + JITedImage = new __tgt_device_image(); + *JITedImage = Image; + + auto &ImageMB = CPUII.JITImages.back(); + + JITedImage->ImageStart = (void *)ImageMB->getBufferStart(); + JITedImage->ImageEnd = (void *)ImageMB->getBufferEnd(); + + return JITedImage; } -/// A map from a bitcode image start address to its corresponding triple. If the -/// image is not in the map, it is not a bitcode image. -DenseMap BitcodeImageMap; +Expected<__tgt_device_image *> +JITEngine::process(__tgt_device_image &Image, + target::plugin::GenericDeviceTy &Device) { + const std::string &ComputeUnitKind = Device.getComputeUnitKind(); -/// Output images generated from LLVM backend. -SmallVector, 4> JITImages; + PostProcessingFn PostProcessing = [&Device](std::unique_ptr MB) + -> Expected> { + return Device.doJITPostProcessing(std::move(MB)); + }; -/// A list of __tgt_device_image images. 
-std::list<__tgt_device_image> TgtImages; -} // namespace + { + std::shared_lock SharedLock(BitcodeImageMapMutex); + auto Itr = BitcodeImageMap.find(Image.ImageStart); + if (Itr != BitcodeImageMap.end() && Itr->second == TT.getArch()) + return compile(Image, ComputeUnitKind, PostProcessing); + } + + return &Image; +} -namespace llvm { -namespace omp { -namespace jit { -bool checkBitcodeImage(__tgt_device_image *Image, Triple::ArchType TA) { +bool JITEngine::checkBitcodeImage(__tgt_device_image &Image) { TimeTraceScope TimeScope("Check bitcode image"); + std::lock_guard Lock(BitcodeImageMapMutex); { - auto Itr = BitcodeImageMap.find(Image->ImageStart); - if (Itr != BitcodeImageMap.end() && Itr->second == TA) + auto Itr = BitcodeImageMap.find(Image.ImageStart); + if (Itr != BitcodeImageMap.end() && Itr->second == TT.getArch()) return true; } - StringRef Data(reinterpret_cast(Image->ImageStart), - reinterpret_cast(Image->ImageEnd) - - reinterpret_cast(Image->ImageStart)); + StringRef Data(reinterpret_cast(Image.ImageStart), + reinterpret_cast(Image.ImageEnd) - + reinterpret_cast(Image.ImageStart)); std::unique_ptr MB = MemoryBuffer::getMemBuffer( Data, /* BufferName */ "", /* RequiresNullTerminator */ false); if (!MB) @@ -384,37 +389,8 @@ } auto ActualTriple = FOrErr->TheReader.getTargetTriple(); + auto BitcodeTA = Triple(ActualTriple).getArch(); + BitcodeImageMap[Image.ImageStart] = BitcodeTA; - if (Triple(ActualTriple).getArch() == TA) { - BitcodeImageMap[Image->ImageStart] = TA; - return true; - } - - return false; + return BitcodeTA == TT.getArch(); } - -Expected<__tgt_device_image *> compile(__tgt_device_image *Image, - Triple::ArchType TA, std::string MCPU, - unsigned OptLevel, - PostProcessingFn PostProcessing) { - JITEngine J(TA, MCPU); - - auto ImageMBOrErr = J.run(Image, OptLevel, PostProcessing); - if (!ImageMBOrErr) - return ImageMBOrErr.takeError(); - - JITImages.push_back(std::move(*ImageMBOrErr)); - TgtImages.push_back(*Image); - - auto &ImageMB = 
JITImages.back(); - auto *NewImage = &TgtImages.back(); - - NewImage->ImageStart = (void *)ImageMB->getBufferStart(); - NewImage->ImageEnd = (void *)ImageMB->getBufferEnd(); - - return NewImage; -} - -} // namespace jit -} // namespace omp -} // namespace llvm Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -21,6 +21,7 @@ #include "Debug.h" #include "DeviceEnvironment.h" #include "GlobalHandler.h" +#include "JIT.h" #include "MemoryManager.h" #include "Utilities.h" #include "omptarget.h" @@ -37,6 +38,7 @@ namespace llvm { namespace omp { namespace target { + namespace plugin { struct GenericPluginTy; @@ -378,10 +380,8 @@ } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } - /// Get target architecture. - virtual std::string getArch() const { - return "unknown"; - } + /// Get target compute unit kind (e.g., sm_80, or gfx908). + virtual std::string getComputeUnitKind() const { return "unknown"; } /// Post processing after jit backend. The ownership of \p MB will be taken. virtual Expected> @@ -514,7 +514,8 @@ /// Construct a plugin instance. GenericPluginTy() - : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr) {} + : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), + JIT(getTripleArch()) {} virtual ~GenericPluginTy() {} @@ -558,6 +559,10 @@ return *GlobalHandler; } + /// Get the reference to the JIT used for all devices connected to this + /// plugin. + JITEngine &getJIT() { return JIT; } + /// Get the OpenMP requires flags set for this plugin. int64_t getRequiresFlags() const { return RequiresFlags; } @@ -609,6 +614,9 @@ /// Internal allocator for different structures. 
BumpPtrAllocator Allocator; + + /// The JIT engine shared by all devices connected to this plugin. + JITEngine JIT; }; /// Class for simplifying the getter operation of the plugin. Anywhere on the Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -664,7 +664,7 @@ if (elf_check_machine(TgtImage, Plugin::get().getMagicElfBits())) return true; - return jit::checkBitcodeImage(TgtImage, Plugin::get().getTripleArch()); + return Plugin::get().getJIT().checkBitcodeImage(*TgtImage); } int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage, @@ -737,34 +737,14 @@ GenericPluginTy &Plugin = Plugin::get(); GenericDeviceTy &Device = Plugin.getDevice(DeviceId); - // If it is a bitcode image, we have to jit the binary image before loading to - // the device. - { - // TODO: Move this (at least the environment variable) into the JIT.h. 
- UInt32Envar JITOptLevel("LIBOMPTARGET_JIT_OPT_LEVEL", 3); - Triple::ArchType TA = Plugin.getTripleArch(); - std::string Arch = Device.getArch(); - - jit::PostProcessingFn PostProcessing = - [&Device](std::unique_ptr MB) - -> Expected> { - return Device.doJITPostProcessing(std::move(MB)); - }; - - if (jit::checkBitcodeImage(TgtImage, TA)) { - auto TgtImageOrErr = - jit::compile(TgtImage, TA, Arch, JITOptLevel, PostProcessing); - if (!TgtImageOrErr) { - auto Err = TgtImageOrErr.takeError(); - REPORT("Failure to jit binary image from bitcode image %p on device " - "%d: %s\n", - TgtImage, DeviceId, toString(std::move(Err)).data()); - return nullptr; - } - - TgtImage = *TgtImageOrErr; - } + auto ImageOrErr = Plugin.getJIT().process(*TgtImage, Device); + if (!ImageOrErr) { + auto Err = ImageOrErr.takeError(); + REPORT("Failure to jit IR image %p on device %d: %s\n", TgtImage, DeviceId, + toString(std::move(Err)).data()); + return nullptr; } + TgtImage = ImageOrErr.get(); auto TableOrErr = Device.loadBinary(Plugin, TgtImage); if (!TableOrErr) { Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -784,8 +784,10 @@ return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"); } - /// See GenericDeviceTy::getArch(). - std::string getArch() const override { return ComputeCapability.str(); } + /// See GenericDeviceTy::getComputeUnitKind(). + std::string getComputeUnitKind() const override { + return ComputeCapability.str(); + } private: using CUDAStreamManagerTy = GenericDeviceResourceManagerTy;