diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -17,6 +17,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/Triple.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/X86TargetParser.h" @@ -45,6 +46,28 @@ 272 // ptr64 }; +static const unsigned X86VGPUAddrSpaceMap[] = { + 0, // Default + 1, // opencl_global + 3, // opencl_local + 4, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 1, // opencl_global_device + 1, // opencl_global_host + 1, // cuda_device + 4, // cuda_constant + 3, // cuda_shared + 1, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 3, // sycl_local + 0, // sycl_private + 270, // ptr32_sptr + 271, // ptr32_uptr + 272 // ptr64 +}; + // X86 target abstract base class; x86-32 and x86-64 are very close, so // most of the implementation can be shared. class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { @@ -161,6 +184,9 @@ getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF(); if (IsWinCOFF) MaxVectorAlign = MaxTLSAlign = 8192u * getCharWidth(); + + if (Triple.getVendor() == llvm::Triple::VGPU) + AddrSpaceMap = &X86VGPUAddrSpaceMap; } const char *getLongDoubleMangling() const override { @@ -387,6 +413,10 @@ uint64_t getPointerAlignV(unsigned AddrSpace) const override { return getPointerWidthV(AddrSpace); } + + const llvm::omp::GV &getGridValue() const override { + return llvm::omp::VirtualGpuGridValues; + } }; // X86-32 generic target diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h new file mode 100644 --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.h @@ -0,0 +1,44 @@ +//== CGOpenMPRuntimeVirtualGPU.h - Interface to OpenMP Virtual GPU Runtimes ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides a class for OpenMP runtime code generation specialized to +// virtual GPU from generalized CGOpenMPRuntimeGPU class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H +#define LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H + +#include "CGOpenMPRuntime.h" +#include "CGOpenMPRuntimeGPU.h" +#include "CodeGenFunction.h" +#include "clang/AST/StmtOpenMP.h" + +namespace clang { +namespace CodeGen { + +class CGOpenMPRuntimeVirtualGPU final : public CGOpenMPRuntimeGPU { + +public: + explicit CGOpenMPRuntimeVirtualGPU(CodeGenModule &CGM); + + /// Get the GPU warp size. + llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; + + /// Get the id of the current thread on the GPU. 
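+  /// Unlike the NVPTX and AMDGCN runtimes, which lower this query to a
+  /// target intrinsic, the virtual GPU lowers it (and getGPUWarpSize above)
+  /// to a plain call into the device runtime; see the .cpp file below. For
+  /// example, device code generated for
+  /// \code
+  ///   #pragma omp parallel
+  ///   { /* ... */ }
+  /// \endcode
+  /// ends up calling __kmpc_get_thread_id_in_block(), the runtime entry
+  /// declared for this purpose in OMPKinds.def further down in this patch.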
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOpenMPRuntimeVirtualGPU_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeVirtualGPU.cpp
@@ -0,0 +1,54 @@
+//= CGOpenMPRuntimeVirtualGPU.cpp - Interface to OpenMP Virtual GPU Runtimes =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to the
+// virtual GPU target, derived from the generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeVirtualGPU.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeVirtualGPU::CGOpenMPRuntimeVirtualGPU(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP Virtual GPU can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeVirtualGPU::getGPUWarpSize(CodeGenFunction &CGF) {
+  llvm::ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
+                             Args);
+}
+
+llvm::Value *CGOpenMPRuntimeVirtualGPU::getGPUThreadID(CodeGenFunction &CGF) {
+  llvm::ArrayRef<llvm::Value *> Args{};
+  return CGF.EmitRuntimeCall(
+      OMPBuilder.getOrCreateRuntimeFunction(
+          CGM.getModule(), OMPRTL___kmpc_get_thread_id_in_block),
+      Args);
+}
+
+void CGOpenMPRuntimeVirtualGPU::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags,
+    llvm::GlobalValue::LinkageTypes Linkage) {
+  CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage);
+}
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@
   CGOpenMPRuntimeAMDGCN.cpp
   CGOpenMPRuntimeGPU.cpp
   CGOpenMPRuntimeNVPTX.cpp
+  CGOpenMPRuntimeVirtualGPU.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -21,6 +21,7 @@
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeVirtualGPU.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -254,7 +255,9 @@
     OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
     break;
   default:
-    if (LangOpts.OpenMPSimd)
+    if (getTriple().getVendor() == llvm::Triple::VGPU) {
+      OpenMPRuntime.reset(new CGOpenMPRuntimeVirtualGPU(*this));
+    } else if (LangOpts.OpenMPSimd)
      OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
    else
      OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3078,4 +3078,13 @@
   if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                           options::OPT_fno_use_init_array, true))
     CC1Args.push_back("-fno-use-init-array");
+
+  if (DriverArgs.hasArg(options::OPT_S))
+    return;
+
+  if (getTriple().getVendor() == llvm::Triple::VGPU) {
+    std::string BitcodeSuffix = "x86_64-vgpu";
+    clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args,
+                                             BitcodeSuffix, getTriple());
+  }
 }
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3983,8 +3983,10 @@
   }
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
-  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
-                        Args.hasArg(options::OPT_fopenmp_cuda_mode);
+  Opts.OpenMPCUDAMode =
+      Opts.OpenMPIsDevice &&
+      (T.isNVPTX() || T.isAMDGCN() || T.getVendor() == llvm::Triple::VGPU) &&
+      Args.hasArg(options::OPT_fopenmp_cuda_mode);
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
   Opts.OpenMPCUDAForceFullRuntime =
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -162,7 +162,8 @@
     Mesa,
     SUSE,
     OpenEmbedded,
-    LastVendorType = OpenEmbedded
+    VGPU,
+    LastVendorType = VGPU
   };
   enum OSType {
     UnknownOS,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -114,6 +114,16 @@
   128, // GV_Default_WG_Size
 };
 
+/// For Virtual GPUs
+static constexpr GV VirtualGpuGridValues = {
+    256,  // GV_Slot_Size
+    32,   // GV_Warp_Size
+    1024, // GV_Max_Teams
+    896,  // GV_SimpleBufferSize
+    1024, // GV_Max_WG_Size
+    128,  // GV_Default_WG_Size
+};
+
 } // namespace omp
 } // namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -222,6 +222,7 @@
 __OMP_RTL(omp_get_level, false, Int32, )
 __OMP_RTL(omp_get_ancestor_thread_num, false, Int32, Int32)
 __OMP_RTL(omp_get_team_size, false, Int32, Int32)
+__OMP_RTL(omp_get_team_num, false, Int32, )
 __OMP_RTL(omp_get_active_level, false, Int32, )
 __OMP_RTL(omp_in_final, false, Int32, )
 __OMP_RTL(omp_get_proc_bind, false, Int32, )
@@ -454,6 +455,10 @@
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
+__OMP_RTL(__kmpc_get_block_size, false, Int32, )
+__OMP_RTL(__kmpc_get_thread_id_in_block, false, Int32, )
+
 __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
 
 __OMP_RTL(__last, false, Void, )
@@ -645,6 +650,7 @@
 __OMP_RTL_ATTRS(omp_get_ancestor_thread_num, GetterAttrs, AttributeSet(),
                 ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_team_size, GetterAttrs, AttributeSet(), ParamAttrs())
+__OMP_RTL_ATTRS(omp_get_team_num, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_active_level, GetterAttrs, AttributeSet(),
                 ParamAttrs())
 __OMP_RTL_ATTRS(omp_in_final,
GetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(omp_get_proc_bind, GetterAttrs, AttributeSet(), ParamAttrs()) diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -179,6 +179,8 @@ case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case VGPU: + return "vgpu"; } llvm_unreachable("Invalid VendorType!"); @@ -482,22 +484,23 @@ static Triple::VendorType parseVendor(StringRef VendorName) { return StringSwitch(VendorName) - .Case("apple", Triple::Apple) - .Case("pc", Triple::PC) - .Case("scei", Triple::SCEI) - .Case("sie", Triple::SCEI) - .Case("fsl", Triple::Freescale) - .Case("ibm", Triple::IBM) - .Case("img", Triple::ImaginationTechnologies) - .Case("mti", Triple::MipsTechnologies) - .Case("nvidia", Triple::NVIDIA) - .Case("csr", Triple::CSR) - .Case("myriad", Triple::Myriad) - .Case("amd", Triple::AMD) - .Case("mesa", Triple::Mesa) - .Case("suse", Triple::SUSE) - .Case("oe", Triple::OpenEmbedded) - .Default(Triple::UnknownVendor); + .Case("apple", Triple::Apple) + .Case("pc", Triple::PC) + .Case("scei", Triple::SCEI) + .Case("sie", Triple::SCEI) + .Case("fsl", Triple::Freescale) + .Case("ibm", Triple::IBM) + .Case("img", Triple::ImaginationTechnologies) + .Case("mti", Triple::MipsTechnologies) + .Case("nvidia", Triple::NVIDIA) + .Case("csr", Triple::CSR) + .Case("myriad", Triple::Myriad) + .Case("amd", Triple::AMD) + .Case("mesa", Triple::Mesa) + .Case("suse", Triple::SUSE) + .Case("oe", Triple::OpenEmbedded) + .Case("vgpu", Triple::VGPU) + .Default(Triple::UnknownVendor); } static Triple::OSType parseOS(StringRef OSName) { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -43,6 +43,9 @@ #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/CodeExtractor.h" +#include +#include + using namespace llvm; using namespace omp; @@ -785,6 +788,12 @@ if (PrintOpenMPKernels) printKernels(); + { + llvm::Triple Triple(M.getTargetTriple()); + if (Triple.getVendor() == llvm::Triple::VGPU) + Changed |= expandSharedVariable(); + } + Changed |= runAttributor(IsModulePass); // Recollect uses, in case Attributor deleted any. @@ -1771,6 +1780,13 @@ /// the cases we can avoid taking the address of a function. bool rewriteDeviceCodeStateMachine(); + /// Expand shared variables when the target doesn't support, such as host + /// offloading. 
Take \p ThreadStates as an example:
+  ///   ThreadStates->xxx
+  /// will be expanded to the following form:
+  ///   ThreadStatesArray[omp_get_team_num()]->xxx
+  bool expandSharedVariable();
+
   ///
   ///}}
 
@@ -2063,6 +2079,102 @@
   return Changed;
 }
 
+static bool replaceUses(Constant &G,
+                        SmallVectorImpl<Instruction *> &InstructionStack,
+                        SmallPtrSetImpl<ConstantExpr *> &SeenConstants) {
+  bool Changed = false;
+
+  // Go through every Use of G and replace the use accordingly
+  SmallVector<Use *> Uses(make_pointer_range(G.uses()));
+  for (auto *U : Uses) {
+    if (auto *C = dyn_cast<ConstantExpr>(U->getUser())) {
+      assert(SeenConstants.insert(C).second &&
+             "Constant has two operands that need to be replaced, not "
+             "supported yet!");
+      // Materialize the constant expression as an instruction so one of its
+      // operands can point at the (non-constant) GEP on top of the stack,
+      // then recurse into the users of the constant expression itself.
+      Instruction *ConstAsInst = C->getAsInstruction();
+      ConstAsInst->setOperand(U->getOperandNo(), InstructionStack.back());
+      InstructionStack.push_back(ConstAsInst);
+      Changed |= replaceUses(*C, InstructionStack, SeenConstants);
+      assert(InstructionStack.back() == ConstAsInst && "Stack broken!");
+      InstructionStack.pop_back();
+      ConstAsInst->deleteValue();
+    } else if (auto *UserI = dyn_cast<Instruction>(U->getUser())) {
+      // Clone the whole instruction stack right before the user (or before
+      // the incoming block's terminator for PHI users) and rewire the user
+      // to the last clone.
+      Instruction *LastInst = nullptr;
+      Instruction *InstClone = nullptr;
+      for (auto *Inst : InstructionStack) {
+        Instruction *LastInstClone = InstClone;
+        InstClone = Inst->clone();
+        if (LastInstClone)
+          InstClone->replaceUsesOfWith(LastInst, LastInstClone);
+        Instruction *IP = UserI;
+        if (auto *PHI = dyn_cast<PHINode>(UserI))
+          IP = PHI->getIncomingBlock(*U)->getTerminator();
+        InstClone->insertBefore(IP);
+        LastInst = Inst;
+      }
+      // assert(InstClone && InstClone->getNextNode() == UserI);
+      UserI->setOperand(U->getOperandNo(), InstClone);
+      Changed = true;
+    } else {
+      U->getUser()->dump();
+      // llvm_unreachable("Unknown User!\n");
+    }
+  }
+
+  return Changed;
+}
+
+bool OpenMPOpt::expandSharedVariable() {
+  bool Changed = false;
+
+  constexpr const uint64_t MaxNumBlocks = 256;
+
+  FunctionCallee BlockIdFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+      M, OMPRTL_omp_get_team_num);
+
+  auto *Int32Zero = ConstantInt::get(OMPInfoCache.OMPBuilder.Int32, 0);
+  SmallPtrSet<ConstantExpr *, 8> SeenConstants;
+  SmallVector<Instruction *> InstructionStack;
+
+  for (auto &G : M.globals()) {
+    // TODO: Rewrite with enum value
+    if (G.getAddressSpace() != 3)
+      continue;
+
+    // Create a new array
+    PointerType *GlobalType =
+        PointerType::get(G.getType()->getElementType(), 3);
+    ArrayType *TheArrayType =
+        ArrayType::get(GlobalType->getElementType(), MaxNumBlocks);
+    Constant *TheArray = new GlobalVariable(
+        M, TheArrayType, /* isConstant */ false, GlobalValue::PrivateLinkage,
+        UndefValue::get(TheArrayType), G.getName() + ".array");
+
+    TheArray = ConstantExpr::getAddrSpaceCast(
+        TheArray,
+        PointerType::get(TheArray->getType()->getPointerElementType(), 3));
+
+    auto *BlockId = CallInst::Create(BlockIdFn, {}, "block_id");
+    InstructionStack.push_back(BlockId);
+    auto *NewElement = GetElementPtrInst::Create(
+        nullptr, TheArray, {Int32Zero, BlockId}, G.getName());
+    InstructionStack.push_back(NewElement);
+
+    Changed |= replaceUses(G, InstructionStack, SeenConstants);
+    assert(InstructionStack.back() == NewElement && "Broken stack!");
+
+    InstructionStack.pop_back();
+    NewElement->deleteValue();
+    assert(InstructionStack.back() == BlockId && "Broken stack!");
+
+    InstructionStack.pop_back();
+    BlockId->deleteValue();
+    assert(InstructionStack.empty() && "Broken stack!");
+  }
+
+  return Changed;
+}
+
 /// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper { using Base = StateWrapper; diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -68,6 +68,7 @@ "CGOpenMPRuntimeAMDGCN.cpp", "CGOpenMPRuntimeGPU.cpp", "CGOpenMPRuntimeNVPTX.cpp", + "CGOpenMPRuntimeVirtualGPU.cpp", "CGRecordLayoutBuilder.cpp", "CGStmt.cpp", "CGStmtOpenMP.cpp", diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -39,6 +39,8 @@ set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) endif() + + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include) endif() # Check and set up common compiler flags. diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -270,3 +270,68 @@ # Install bitcode library under the lib destination folder. install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") endforeach() + +set(bc_flags -S -x c++ -std=c++20 + ${clang_opt_flags} + -target x86_64-vgpu + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -Xclang -target-feature + -I${include_directory} + -I${devicertl_base_directory}/../include + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL} + -stdlib=libc++ +) + +add_custom_target(omptarget-vgpu-bc) + +set(bc_files "") +foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-vgpu.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${cuda_compiler} ${bc_flags} + ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) +endforeach() + +set(bclib_name "libomptarget-x86_64-vgpu.bc") + +# Link to a bitcode library. +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${bc_linker} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" +) + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt + COMMAND ${opt} + -O1 -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMENT "Optimizing LLVM bitcode ${bclib_name}" +) +set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + +set(bclib_target_name "omptarget-x86_64-vgpu-bc") + +add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt) +add_dependencies(omptarget-vgpu-bc ${bclib_target_name}) + +# Copy library to destination. +add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + +# Install bitcode library under the lib destination folder. 
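+#
+# At this point the x86_64-vgpu device runtime is complete. To recap the
+# pipeline defined above: every DeviceRTL source is compiled to bitcode with
+# -target x86_64-vgpu, the pieces are linked into libomptarget-x86_64-vgpu.bc,
+# the result is optimized with opt -O1 and copied into
+# ${LIBOMPTARGET_LIBRARY_DIR}. The "x86_64-vgpu" suffix is what the driver's
+# addOpenMPDeviceRTL() call in Gnu.cpp (earlier in this patch) searches for.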
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -352,6 +352,16 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Target properties
+///
+///{
+uint32_t __kmpc_get_warp_size();
+
+uint32_t __kmpc_get_block_size();
+
+uint32_t __kmpc_get_thread_id_in_block();
+///}
 }
 
 #endif
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -103,6 +103,8 @@
   if (IsSPMD)
     return;
 
+  synchronize::threads();
+
   // Signal the workers to exit the state machine and exit the kernel.
   state::ParallelRegionFn = nullptr;
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -156,14 +156,90 @@
   mapping::getWarpSize();
 }
 
+uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+
 #pragma omp end declare variant
 ///}
 
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+} // namespace impl
+} // namespace _OMP
+
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+#include "DeviceEnvironment.h"
+
+namespace _OMP {
+namespace impl {
+
+constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::VirtualGpuGridValues;
+}
+
+LaneMaskTy activemask() {
+  uint64_t B = 0;
+  uint32_t N = mapping::getWarpSize();
+  while (N)
+    B |= ((LaneMaskTy)1 << (--N));
+  return B;
+}
+
+LaneMaskTy lanemaskLT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() {
+  return getThreadEnvironment()->getThreadIdInBlock();
+}
+
+uint32_t getBlockSize() { return getThreadEnvironment()->getBlockSize(); }
+
+uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); }
+
+uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); }
+
+uint32_t getNumberOfBlocks() {
+  return getThreadEnvironment()->getNumberOfBlocks();
+}
+
+uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); }
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
+}
 
 } // namespace impl
 } // namespace _OMP
 
+#pragma omp end declare variant
+///}
+
 bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
   if (IsSPMD || icv::Level)
     return false;
@@ -237,5 +313,13 @@
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
   return mapping::getNumberOfProcessorElements();
 }
+
+__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
+  return mapping::getWarpSize();
+}
+
+__attribute__((noinline)) uint32_t __kmpc_get_block_size() {
+  return mapping::getBlockSize();
+}
 }
 #pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -16,10 +16,9 @@
 namespace _OMP {
 namespace impl {
 
-/// AMDGCN Implementation
+/// Generic Implementation - AMDGCN, VGPU
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 double getWTick() { return ((double)1E-9); }
 
@@ -31,8 +30,6 @@
   return 0;
 }
 
-#pragma omp end declare variant
-
 /// NVPTX Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -214,6 +214,71 @@
 
 } // namespace impl
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(x86, x86_64)}, implementation = {extension(match_any)})
+
+#include "DeviceEnvironment.h"
+namespace impl {
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return VGPUImpl::atomicInc(Address, Val, Ordering);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % mapping::getWarpSize() == 0);
+  getThreadEnvironment()->namedBarrier(true);
+}
+
+void fenceTeam(int) { getThreadEnvironment()->fenceTeam(); }
+
+void fenceKernel(int memory_order) {
+  getThreadEnvironment()->fenceKernel(memory_order);
+}
+
+// Simply call fenceKernel because there is no need to sync with host
+void fenceSystem(int) { fenceKernel(0); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) {
+  getThreadEnvironment()->syncWarp();
+}
+
+void syncThreads() { getThreadEnvironment()->namedBarrier(false); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here, the overload won't happen. Investigate
+// later!
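+// The lock routines below map an omp_lock_t onto a single uint32_t that is
+// either UNSET or SET, built from the atomics above plus the plugin-side
+// VGPUImpl::setLock. A sketch of the intended call chain (hypothetical user
+// code on the left):
+//
+//   omp_init_lock(&L);    // -> initLock  -> unsetLock: store UNSET
+//   omp_set_lock(&L);     // -> setLock   -> CAS UNSET->SET with spin backoff
+//   omp_test_lock(&L);    // -> testLock  -> atomicAdd of 0 reads the state
+//   omp_unset_lock(&L);   // -> unsetLock -> atomicExchange back to UNSET
+//   omp_destroy_lock(&L); // -> same as unsetLock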
+void unsetLock(omp_lock_t *Lock) { + (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST); +} + +int testLock(omp_lock_t *Lock) { + return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST); +} + +void initLock(omp_lock_t *Lock) { unsetLock(Lock); } + +void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); } + +void setLock(omp_lock_t *Lock) { + VGPUImpl::setLock((uint32_t *)Lock, UNSET, SET, OMP_SPIN, + mapping::getBlockId(), atomicCAS); +} + +} // namespace impl + +#pragma omp end declare variant +///} + void synchronize::init(bool IsSPMD) { if (!IsSPMD) impl::namedBarrierInit(); diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -45,6 +45,24 @@ #pragma omp end declare variant +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(x86, x86_64)}, implementation = {extension(match_any)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + *LowBits = (uint32_t)(Val & static_cast(0x00000000FFFFFFFF)); + *HighBits = + (uint32_t)((Val & static_cast(0xFFFFFFFF00000000)) >> 32); +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; +} + +#pragma omp end declare variant + /// NVPTX Implementation /// ///{ @@ -109,6 +127,26 @@ #pragma omp end declare variant } // namespace impl +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(x86, x86_64)}, implementation = {extension(match_any)}) + +#include "DeviceEnvironment.h" +namespace impl { + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return getThreadEnvironment()->shuffle(Var, SrcLane); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + return getThreadEnvironment()->shuffleDown(Var, Delta); +} + +} // namespace impl +#pragma omp end declare variant + uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { return impl::Pack(LowBits, HighBits); } diff --git a/openmp/libomptarget/include/DeviceEnvironment.h b/openmp/libomptarget/include/DeviceEnvironment.h --- a/openmp/libomptarget/include/DeviceEnvironment.h +++ b/openmp/libomptarget/include/DeviceEnvironment.h @@ -22,4 +22,64 @@ uint32_t DynamicMemSize; }; +using LaneMaskTy = uint64_t; + +// Forward declaration +class WarpEnvironmentTy; +class ThreadBlockEnvironmentTy; +class CTAEnvironmentTy; +namespace VGPUImpl { +class ThreadEnvironmentTy; +void setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, uint32_t OmpSpin, + uint32_t BlockId, + uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t, int)); +uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering); +} // namespace VGPUImpl + +class ThreadEnvironmentTy { + VGPUImpl::ThreadEnvironmentTy *Impl; + +public: + ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE, + CTAEnvironmentTy *CTAE); + + ~ThreadEnvironmentTy(); + + unsigned getThreadIdInWarp() const; + + unsigned getThreadIdInBlock() const; + + unsigned getGlobalThreadId() const; + + unsigned getBlockSize() const; + + unsigned getKernelSize() const; + + unsigned getBlockId() const; + + unsigned getNumberOfBlocks() const; + + LaneMaskTy getActiveMask() const; + + unsigned getWarpSize() const; + + int32_t shuffle(int32_t Var, uint64_t SrcLane); + + int32_t shuffleDown(int32_t Var, uint32_t Delta); + + void fenceKernel(int32_t MemoryOrder); + + void fenceTeam(); + + void 
syncWarp(); + + void namedBarrier(bool Generic); + + void setBlockEnv(ThreadBlockEnvironmentTy *TBE); + + void resetBlockEnv(); +}; + +ThreadEnvironmentTy *getThreadEnvironment(void); + #endif diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ b/openmp/libomptarget/plugins/CMakeLists.txt @@ -75,6 +75,7 @@ add_subdirectory(ppc64) add_subdirectory(ppc64le) add_subdirectory(ve) +add_subdirectory(vgpu) add_subdirectory(x86_64) add_subdirectory(remote) diff --git a/openmp/libomptarget/plugins/vgpu/CMakeLists.txt b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt @@ -0,0 +1,58 @@ +set(tmachine_name "vgpu") +set(tmachine_libname "vgpu") +set(tmachine_triple "x86_64-vgpu") +set(elf_machine_id "62") + +if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceEnvironment.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20) + target_compile_options("omptarget.rtl.${tmachine_libname}" PRIVATE "-stdlib=libc++") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + elf_common + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + ${OPENMP_PTHREAD_LIB} + "-rdynamic" + c++ + #"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + ) + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}") + + # Report to the parent scope that we are building a plugin. + set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS + "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) +else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") +endif(LIBOMPTARGET_DEP_LIBELF_FOUND) diff --git a/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp b/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironment.cpp @@ -0,0 +1,118 @@ +//===---- DeviceEnvironment.cpp - Virtual GPU Device Environment -- C++ ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of VGPU environment classes. +// +//===----------------------------------------------------------------------===// + +#include "DeviceEnvironment.h" +#include "DeviceEnvironmentImpl.h" +#include +#include +#include + +std::mutex AtomicIncLock; + +uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) { + std::lock_guard G(AtomicIncLock); + uint32_t V = *Address; + if (V >= Val) + *Address = 0; + else + *Address += 1; + return V; +} + +void VGPUImpl::setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, + uint32_t OmpSpin, uint32_t BlockId, + uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t, + int)) { + // TODO: not sure spinning is a good idea here.. + while (atomicCAS((uint32_t *)Lock, Unset, Set, __ATOMIC_SEQ_CST) != Unset) { + std::clock_t start = std::clock(); + std::clock_t now; + for (;;) { + now = std::clock(); + std::clock_t cycles = + now > start ? now - start : now + (0xffffffff - start); + if (cycles >= 1000 * BlockId) { + break; + } + } + } // wait for 0 to be the read value +} + +extern thread_local ThreadEnvironmentTy *ThreadEnvironment; + +ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; } + +ThreadEnvironmentTy::ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE, + CTAEnvironmentTy *CTAE) + : Impl(new VGPUImpl::ThreadEnvironmentTy(Id, WE, CTAE)) {} + +ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; } + +void ThreadEnvironmentTy::fenceTeam() { Impl->fenceTeam(); } + +void ThreadEnvironmentTy::syncWarp() { Impl->syncWarp(); } + +unsigned ThreadEnvironmentTy::getThreadIdInWarp() const { + return Impl->getThreadIdInWarp(); +} + +unsigned ThreadEnvironmentTy::getThreadIdInBlock() const { + return Impl->getThreadIdInBlock(); +} + +unsigned ThreadEnvironmentTy::getGlobalThreadId() const { + return Impl->getGlobalThreadId(); +} + +unsigned ThreadEnvironmentTy::getBlockSize() const { + return Impl->getBlockSize(); +} + +unsigned ThreadEnvironmentTy::getKernelSize() const { + return Impl->getKernelSize(); +} + +unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); } + +unsigned ThreadEnvironmentTy::getNumberOfBlocks() const { + return Impl->getNumberOfBlocks(); +} + +LaneMaskTy ThreadEnvironmentTy::getActiveMask() const { + return Impl->getActiveMask(); +} + +int32_t ThreadEnvironmentTy::shuffle(int32_t Var, uint64_t SrcLane) { + return Impl->shuffle(Var, SrcLane); +} + +int32_t ThreadEnvironmentTy::shuffleDown(int32_t Var, uint32_t Delta) { + return Impl->shuffleDown(Var, Delta); +} + +void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) { + return Impl->fenceKernel(MemoryOrder); +} + +void ThreadEnvironmentTy::namedBarrier(bool Generic) { + Impl->namedBarrier(Generic); +} + +void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) { + Impl->setBlockEnv(TBE); +} + +void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); } + +unsigned ThreadEnvironmentTy::getWarpSize() const { + return Impl->getWarpSize(); +} diff --git a/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h b/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/DeviceEnvironmentImpl.h @@ -0,0 +1,168 @@ +//===---- DeviceEnvironmentImpl.h - Virtual GPU device environment - C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
+
+#include "DeviceEnvironment.h"
+#include <atomic>
+#include <barrier>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+class WarpEnvironmentTy {
+  const unsigned ID;
+  const unsigned NumThreads;
+
+  std::vector<int32_t> ShuffleBuffer;
+
+  std::barrier<std::function<void()>> Barrier;
+  std::barrier<std::function<void()>> ShuffleBarrier;
+  std::barrier<std::function<void()>> ShuffleDownBarrier;
+
+public:
+  WarpEnvironmentTy(unsigned ID, unsigned NumThreads)
+      : ID(ID), NumThreads(NumThreads), ShuffleBuffer(NumThreads),
+        Barrier(NumThreads, []() {}), ShuffleBarrier(NumThreads, []() {}),
+        ShuffleDownBarrier(NumThreads, []() {}) {}
+
+  unsigned getWarpId() const { return ID; }
+  int getNumThreads() const { return NumThreads; }
+
+  void sync() { Barrier.arrive_and_wait(); }
+  void writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+    ShuffleBuffer[LaneId] = Var;
+  }
+
+  int32_t getShuffleBuffer(unsigned LaneId) { return ShuffleBuffer[LaneId]; }
+
+  void waitShuffleBarrier() { ShuffleBarrier.arrive_and_wait(); }
+
+  void waitShuffleDownBarrier() { ShuffleDownBarrier.arrive_and_wait(); }
+};
+
+class CTAEnvironmentTy {
+public:
+  unsigned ID;
+  unsigned NumThreads;
+  unsigned NumBlocks;
+
+  std::barrier<std::function<void()>> Barrier;
+  std::barrier<std::function<void()>> SyncThreads;
+  std::barrier<std::function<void()>> NamedBarrier;
+
+  CTAEnvironmentTy(unsigned ID, unsigned NumThreads, unsigned NumBlocks)
+      : ID(ID), NumThreads(NumThreads), NumBlocks(NumBlocks),
+        Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+        NamedBarrier(NumThreads, []() {}) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumThreads() const { return NumThreads; }
+
+  unsigned getNumBlocks() const { return NumBlocks; }
+
+  void fence() { Barrier.arrive_and_wait(); }
+  void syncThreads() { SyncThreads.arrive_and_wait(); }
+  void namedBarrier() { NamedBarrier.arrive_and_wait(); }
+};
+
+class ThreadBlockEnvironmentTy {
+  unsigned ID;
+  unsigned NumBlocks;
+
+public:
+  ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks)
+      : ID(ID), NumBlocks(NumBlocks) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumBlocks() const { return NumBlocks; }
+};
+
+namespace VGPUImpl {
+class ThreadEnvironmentTy {
+  unsigned ThreadIdInWarp;
+  unsigned ThreadIdInBlock;
+  unsigned GlobalThreadIdx;
+
+  WarpEnvironmentTy *WarpEnvironment;
+  ThreadBlockEnvironmentTy *ThreadBlockEnvironment;
+  CTAEnvironmentTy *CTAEnvironment;
+
+public:
+  ThreadEnvironmentTy(unsigned ThreadId, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE)
+      : ThreadIdInWarp(ThreadId),
+        ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadId),
+        GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() +
+                        ThreadIdInBlock),
+        WarpEnvironment(WE), ThreadBlockEnvironment(nullptr),
+        CTAEnvironment(CTAE) {}
+
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+    ThreadBlockEnvironment = TBE;
+  }
+
+  void resetBlockEnv() {
+    delete ThreadBlockEnvironment;
+    ThreadBlockEnvironment = nullptr;
+  }
+
+  unsigned getThreadIdInWarp() const { return ThreadIdInWarp; }
+  unsigned getThreadIdInBlock() const { return ThreadIdInBlock; }
+  unsigned getGlobalThreadId() const { return GlobalThreadIdx; }
+
+  unsigned getBlockSize() const { return CTAEnvironment->getNumThreads(); }
+
+  unsigned getBlockId() const { return ThreadBlockEnvironment->getId(); }
+
+  unsigned getNumberOfBlocks() const {
+    return ThreadBlockEnvironment->getNumBlocks();
+  }
+
+  // Total number of threads launched for this kernel: threads per block
+  // times the number of blocks.
+  unsigned getKernelSize() const {
+    return CTAEnvironment->getNumThreads() *
+           ThreadBlockEnvironment->getNumBlocks();
+  }
+
+  // FIXME: This is wrong
+  LaneMaskTy getActiveMask() const { return ~0U; }
+
+  void fenceTeam() { CTAEnvironment->fence(); }
+  void syncWarp() { WarpEnvironment->sync(); }
+
+  int32_t shuffle(int32_t Var, uint64_t SrcLane) {
+    WarpEnvironment->waitShuffleBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleBarrier();
+    // Read the value published by the requested source lane.
+    Var = WarpEnvironment->getShuffleBuffer(SrcLane);
+    return Var;
+  }
+
+  int32_t shuffleDown(int32_t Var, uint32_t Delta) {
+    WarpEnvironment->waitShuffleDownBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleDownBarrier();
+    Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+                                            getWarpSize());
+    return Var;
+  }
+
+  void namedBarrier(bool Generic) {
+    if (Generic) {
+      CTAEnvironment->namedBarrier();
+    } else {
+      CTAEnvironment->syncThreads();
+    }
+  }
+
+  void fenceKernel(int32_t MemoryOrder) {
+    std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+  }
+
+  unsigned getWarpSize() const { return WarpEnvironment->getNumThreads(); }
+};
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_DEVICEENVIRONMENTIMPL_H
diff --git a/openmp/libomptarget/plugins/vgpu/src/rtl.cpp b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp
@@ -0,0 +1,623 @@
+//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for virtual (x86) GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <barrier>
+#include <cassert>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <functional>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "DeviceEnvironmentImpl.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#include "elf_common.h"
+
+#define NUMBER_OF_DEVICES 1
+#define OFFLOADSECTIONNAME "omp_offloading_entries"
+
+#define DEBUG false
+
+/// Array of Dynamic libraries loaded for this target.
+struct DynLibTy {
+  char *FileName;
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
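+  // The lookup below is a linear scan over [EntriesBegin, EntriesEnd); the
+  // table typically holds only one __tgt_offload_entry per kernel or global,
+  // so no index structure is needed.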
+ bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); } + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + +std::vector CTAEnvironments; +std::vector WarpEnvironments; + +struct VGPUTy { + struct KernelTy { + ffi_cif *Cif; + std::function Kernel; + int NumTeams; + + KernelTy(ffi_cif *Cif, std::function Kernel, int NumTeams) + : Cif(Cif), Kernel(Kernel), NumTeams(NumTeams) {} + }; + + struct VGPUStreamTy { + std::queue Kernels; + std::mutex Mtx; + + void emplace(ffi_cif *Cif, std::function F, int NumTeams) { + std::lock_guard Guard(Mtx); + Kernels.emplace(Cif, F, NumTeams); + } + + KernelTy front() { + std::lock_guard Guard(Mtx); + return Kernels.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Kernels.pop(); + } + + bool empty() { + std::lock_guard Guard(Mtx); + return Kernels.empty(); + } + }; + + struct AsyncInfoQueueTy { + std::deque<__tgt_async_info *> Streams; + std::mutex Mtx; + + bool empty() { + std::lock_guard Guard(Mtx); + return Streams.empty(); + } + + __tgt_async_info *front() { + std::lock_guard Guard(Mtx); + return Streams.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Streams.pop_front(); + } + + void emplace(__tgt_async_info *AsyncInfo) { + std::lock_guard Guard(Mtx); + Streams.emplace_back(AsyncInfo); + } + } ExecutionQueue; + + VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) { + assert(AsyncInfo != nullptr && "async_info ptr was null"); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = new VGPUStreamTy(); + + return reinterpret_cast(AsyncInfo->Queue); + } + + std::atomic Running; + std::vector Threads; + int WarpsPerCTA; + int NumCTAs; + + std::unique_ptr>> Barrier; + std::condition_variable WorkAvailable; + std::mutex WorkDoneMtx; + std::condition_variable WorkDone; + + VGPUTy(int NumThreads = -1, int ThreadsPerWarp = -1, int WarpsPerCTA = -1) + : Running(true) { + if (const char *Env = std::getenv("VGPU_NUM_THREADS")) + NumThreads = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP")) + ThreadsPerWarp = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA")) + WarpsPerCTA = std::stoi(Env); + + if (NumThreads == -1) + NumThreads = std::thread::hardware_concurrency(); + if (ThreadsPerWarp == -1) + ThreadsPerWarp = NumThreads; + if (WarpsPerCTA == -1) + WarpsPerCTA = 1; + + NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA); + + // printf("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n", + // NumThreads, + // ThreadsPerWarp, WarpsPerCTA); + + assert(NumThreads % ThreadsPerWarp == 0 && NumThreads % WarpsPerCTA == 0 && + "Invalid VGPU Config"); + + Barrier = std::make_unique>>( + NumThreads, []() {}); + + Threads.reserve(NumThreads); + + auto GlobalThreadIdx = 
0; + for (auto CTAIdx = 0; CTAIdx < NumCTAs; CTAIdx++) { + auto *CTAEnv = + new CTAEnvironmentTy(CTAIdx, NumThreads / NumCTAs, NumCTAs); + for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) { + auto *WarpEnv = new WarpEnvironmentTy(WarpIdx, ThreadsPerWarp); + for (auto ThreadIdx = 0; ThreadIdx < ThreadsPerWarp; ThreadIdx++) { + Threads.emplace_back( + [this, ThreadIdx, GlobalThreadIdx, CTAEnv, WarpEnv]() { + ThreadEnvironment = + new ThreadEnvironmentTy(ThreadIdx, WarpEnv, CTAEnv); + std::function Kernel; + while (Running) { + { + std::unique_lock UniqueLock(ExecutionQueue.Mtx); + + WorkAvailable.wait(UniqueLock, [&]() { + if (!Running) { + return true; + } + bool IsEmpty = ExecutionQueue.Streams.empty(); + + return !IsEmpty; + }); + } + + if (ExecutionQueue.empty()) { + continue; + } + + while (!ExecutionQueue.empty()) { + auto *Stream = getStream(ExecutionQueue.front()); + while (!Stream->empty()) { + auto KernelInfo = Stream->front(); + Kernel = KernelInfo.Kernel; + + const unsigned NumTeams = KernelInfo.NumTeams; + unsigned TeamIdx = 0; + while (TeamIdx < KernelInfo.NumTeams) { + if (CTAEnv->getId() < KernelInfo.NumTeams) { + ThreadEnvironment->setBlockEnv( + new ThreadBlockEnvironmentTy( + TeamIdx + CTAEnv->getId(), NumTeams)); + Kernel(); + ThreadEnvironment->resetBlockEnv(); + } + Barrier->arrive_and_wait(); + TeamIdx += NumCTAs; + } + + if (GlobalThreadIdx == 0) { + delete KernelInfo.Cif; + Stream->pop(); + } + + Barrier->arrive_and_wait(); + } + if (GlobalThreadIdx == 0) { + ExecutionQueue.pop(); + WorkDone.notify_all(); + } + Barrier->arrive_and_wait(); + } + } + delete ThreadEnvironment; + }); + GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads; + } + WarpEnvironments.push_back(WarpEnv); + } + CTAEnvironments.push_back(CTAEnv); + } + } + + ~VGPUTy() { + awaitAll(); + + Running = false; + WorkAvailable.notify_all(); + + for (auto &Thread : Threads) { + if (Thread.joinable()) { + Thread.join(); + } + } + + for (auto *CTAEnv : CTAEnvironments) + delete CTAEnv; + + for (auto *WarpEnv : WarpEnvironments) + delete WarpEnv; + } + + void await(__tgt_async_info *AsyncInfo) { + std::unique_lock UniqueLock(getStream(AsyncInfo)->Mtx); + WorkDone.wait(UniqueLock, + [&]() { return getStream(AsyncInfo)->Kernels.empty(); }); + } + + void awaitAll() { + while (!ExecutionQueue.empty()) { + await(ExecutionQueue.front()); + } + } + + void scheduleAsync(__tgt_async_info *AsyncInfo, ffi_cif *Cif, + std::function F, int NumTeams) { + if (NumTeams == 0) + NumTeams = NumCTAs; + auto *Stream = getStream(AsyncInfo); + Stream->emplace(Cif, F, NumTeams); + ExecutionQueue.emplace(AsyncInfo); + WorkAvailable.notify_all(); + } +}; + +VGPUTy VGPU; + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't have a valid ELF ID we can just fail. 
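+// TARGET_ELF_ID is 62 here (EM_X86_64, set in the plugin's CMakeLists.txt):
+// the virtual GPU device image is a plain x86-64 ELF object, and
+// elf_check_machine from elf_common compares the image's e_machine field
+// against this value, roughly:
+//
+//   GElf_Ehdr Ehdr;
+//   gelf_getehdr(Elf, &Ehdr);
+//   return Ehdr.e_machine == MachineID; // sketch, not the exact source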
+#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid + // the dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. 
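+  // Each table element is a __tgt_offload_entry as declared in omptarget.h
+  // (shown here for reference only):
+  //
+  //   struct __tgt_offload_entry {
+  //     void *addr;       // device address of the kernel or global
+  //     char *name;       // symbol name
+  //     size_t size;      // size in bytes (0 for functions)
+  //     int32_t flags;    // OpenMP offload entry flags
+  //     int32_t reserved; // reserved, must be zero
+  //   };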
+ __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +// Sample implementation of explicit memory allocator. For this plugin all +// kinds are equivalent to each other. +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr, + int32_t kind) { + void *ptr = NULL; + + switch (kind) { + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEFAULT: + ptr = malloc(size); + break; + default: + REPORT("Invalid target data allocation kind"); + } + + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + VGPU.await(async_info); + delete (VGPUTy::VGPUStreamTy *)async_info->Queue; + async_info->Queue = nullptr; + return 0; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount) { + __tgt_async_info AsyncInfo; + int rc = __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, &AsyncInfo); + + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info) { + ffi_cif *cif = new ffi_cif(); + + // All args are references. 
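+  // libffi receives, for each argument, a pointer to the argument's storage.
+  // Since every target argument is passed by reference, each slot holds the
+  // address of an (offset-adjusted) device pointer. For one argument i:
+  //
+  //   ptrs[i] = tgt_args[i] + tgt_offsets[i]; // the device pointer value
+  //   args[i] = &ptrs[i];                     // what ffi_call dereferences
+  //   type[i] = ffi_type_pointer;             // uniform for all arguments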
+  std::shared_ptr<std::vector<ffi_type *>> args_types =
+      std::make_shared<std::vector<ffi_type *>>(arg_num, &ffi_type_pointer);
+  std::shared_ptr<std::vector<void *>> args =
+      std::make_shared<std::vector<void *>>(arg_num);
+  std::shared_ptr<std::vector<void *>> ptrs =
+      std::make_shared<std::vector<void *>>(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    (*ptrs)[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    (*args)[i] = &(*ptrs)[i];
+  }
+
+  ffi_status status = ffi_prep_cif(cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, &(*args_types)[0]);
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  void (*entry)(void);
+  *((void **)&entry) = tgt_entry_ptr;
+
+  // Capture the shared_ptrs by value so the argument arrays stay alive until
+  // the worker threads have executed the kernel.
+  VGPU.scheduleAsync(
+      async_info, cif,
+      [cif, entry, args, args_types, ptrs]() {
+        ffi_call(cif, entry, NULL, &(*args)[0]);
+      },
+      team_num);
+  VGPU.await(async_info);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num, 1, 1, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+                                          void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+                                          int32_t arg_num,
+                                          __tgt_async_info *async_info) {
+  return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr,
+                                                tgt_args, tgt_offsets, arg_num,
+                                                1, 1, 0, async_info);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -34,6 +34,7 @@
     /* SX-Aurora VE target      */ "libomptarget.rtl.ve.so",
     /* AMDGPU target            */ "libomptarget.rtl.amdgpu.so",
     /* Remote target            */ "libomptarget.rtl.rpc.so",
+    /* Virtual GPU target       */ "libomptarget.rtl.vgpu.so",
 };
 
 PluginManager *PM;
@@ -83,7 +84,7 @@
   // is correct and if they are supporting any devices.
   for (auto *Name : RTLNames) {
     DP("Loading library '%s'...\n", Name);
-    void *dynlib_handle = dlopen(Name, RTLD_NOW);
+    void *dynlib_handle = dlopen(Name, RTLD_NOW | RTLD_GLOBAL);
     if (!dynlib_handle) {
       // Library does not exist or cannot be found.
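---

For reviewers who want to try the plugin: a minimal smoke test (hypothetical
example, not part of the patch) can be built against the new triple, with the
virtual device geometry controlled by the VGPU_NUM_THREADS,
VGPU_THREADS_PER_WARP and VGPU_WARPS_PER_CTA environment variables read in
rtl.cpp above:

  // vgpu-smoke.cpp
  // Build: clang++ -fopenmp -fopenmp-targets=x86_64-vgpu vgpu-smoke.cpp
  #include <cstdio>
  #include <omp.h>

  int main() {
    int OnDevice = 0;
    // If the vgpu plugin loads and the image is valid, this region runs on
    // the virtual GPU; otherwise libomptarget falls back to the host.
  #pragma omp target map(from : OnDevice)
    OnDevice = !omp_is_initial_device();
    std::printf("offload %s\n",
                OnDevice ? "ran on the virtual GPU" : "fell back to the host");
    return 0;
  }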