diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -17,6 +17,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/X86TargetParser.h"
 
@@ -45,6 +46,28 @@
     272 // ptr64
 };
 
+static const unsigned X86VGPUAddrSpaceMap[] = {
+    0,   // Default
+    1,   // opencl_global
+    3,   // opencl_local
+    4,   // opencl_constant
+    0,   // opencl_private
+    0,   // opencl_generic
+    1,   // opencl_global_device
+    1,   // opencl_global_host
+    1,   // cuda_device
+    4,   // cuda_constant
+    3,   // cuda_shared
+    1,   // sycl_global
+    0,   // sycl_global_device
+    0,   // sycl_global_host
+    3,   // sycl_local
+    0,   // sycl_private
+    270, // ptr32_sptr
+    271, // ptr32_uptr
+    272  // ptr64
+};
+
 // X86 target abstract base class; x86-32 and x86-64 are very close, so
 // most of the implementation can be shared.
 class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
@@ -162,6 +185,9 @@
         getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
     if (IsWinCOFF)
       MaxVectorAlign = MaxTLSAlign = 8192u * getCharWidth();
+
+    if (Triple.getVendor() == llvm::Triple::OpenMP_VGPU)
+      AddrSpaceMap = &X86VGPUAddrSpaceMap;
   }
 
   const char *getLongDoubleMangling() const override {
@@ -388,6 +414,10 @@
   uint64_t getPointerAlignV(unsigned AddrSpace) const override {
     return getPointerWidthV(AddrSpace);
   }
+
+  const llvm::omp::GV &getGridValue() const override {
+    return llvm::omp::VirtualGpuGridValues;
+  }
 };
 
 // X86-32 generic target
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1119,10 +1119,11 @@
   CGM.addCompilerUsedGlobal(GVMode);
 }
 
-void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
-                                            llvm::Constant *Addr,
-                                            uint64_t Size, int32_t,
-                                            llvm::GlobalValue::LinkageTypes) {
+void CGOpenMPRuntimeGPU::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags,
+    llvm::GlobalValue::LinkageTypes Linkage) {
+  if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::OpenMP_VGPU)
+    return CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage);
   // TODO: Add support for global variables on the device after declare target
   // support.
   if (!isa<llvm::Function>(Addr))
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -249,7 +249,9 @@
     OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
     break;
   default:
-    if (LangOpts.OpenMPSimd)
+    if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) {
+      OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
+    } else if (LangOpts.OpenMPSimd)
       OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
     else
       OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3074,4 +3074,13 @@
   if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                           options::OPT_fno_use_init_array, true))
     CC1Args.push_back("-fno-use-init-array");
+
+  if (DriverArgs.hasArg(options::OPT_S))
+    return;
+
+  if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) {
+    std::string BitcodeSuffix = "x86_64-vgpu";
+    clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args,
+                                             BitcodeSuffix, getTriple());
+  }
 }
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3983,7 +3983,9 @@
   }
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
-  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
+  Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice &&
+                        (T.isNVPTX() || T.isAMDGCN() ||
+                         T.getVendor() == llvm::Triple::OpenMP_VGPU) &&
                         Args.hasArg(options::OPT_fopenmp_cuda_mode);
 
   // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h
--- a/llvm/include/llvm/ADT/Triple.h
+++ b/llvm/include/llvm/ADT/Triple.h
@@ -164,7 +164,8 @@
     Mesa,
     SUSE,
     OpenEmbedded,
-    LastVendorType = OpenEmbedded
+    OpenMP_VGPU,
+    LastVendorType = OpenMP_VGPU
   };
   enum OSType {
     UnknownOS,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -114,6 +114,16 @@
     128, // GV_Default_WG_Size
 };
 
+/// For Virtual GPUs
+static constexpr GV VirtualGpuGridValues = {
+    256,  // GV_Slot_Size
+    32,   // GV_Warp_Size
+    1024, // GV_Max_Teams
+    896,  // GV_SimpleBufferSize
+    1024, // GV_Max_WG_Size
+    128,  // GV_Default_WG_Size
+};
+
 } // namespace omp
 } // namespace llvm
 
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp
--- a/llvm/lib/Support/Triple.cpp
+++ b/llvm/lib/Support/Triple.cpp
@@ -185,6 +185,8 @@
   case PC: return "pc";
   case SCEI: return "scei";
   case SUSE: return "suse";
+  case OpenMP_VGPU:
+    return "vgpu";
   }
 
   llvm_unreachable("Invalid VendorType!");
@@ -492,22 +494,23 @@
 
 static Triple::VendorType parseVendor(StringRef VendorName) {
   return StringSwitch<Triple::VendorType>(VendorName)
-    .Case("apple", Triple::Apple)
-    .Case("pc", Triple::PC)
-    .Case("scei", Triple::SCEI)
-    .Case("sie", Triple::SCEI)
-    .Case("fsl", Triple::Freescale)
-    .Case("ibm", Triple::IBM)
-    .Case("img", Triple::ImaginationTechnologies)
-    .Case("mti", Triple::MipsTechnologies)
-    .Case("nvidia", Triple::NVIDIA)
-    .Case("csr", Triple::CSR)
-    .Case("myriad", Triple::Myriad)
-    .Case("amd", Triple::AMD)
-    .Case("mesa", Triple::Mesa)
-    .Case("suse", Triple::SUSE)
-    .Case("oe", Triple::OpenEmbedded)
-    .Default(Triple::UnknownVendor);
+      .Case("apple", Triple::Apple)
+      .Case("pc", Triple::PC)
+      .Case("scei", Triple::SCEI)
+      .Case("sie", Triple::SCEI)
+      .Case("fsl", Triple::Freescale)
+      .Case("ibm", Triple::IBM)
+      .Case("img", Triple::ImaginationTechnologies)
+      .Case("mti", Triple::MipsTechnologies)
+      .Case("nvidia", Triple::NVIDIA)
+      .Case("csr", Triple::CSR)
+      .Case("myriad", Triple::Myriad)
+      .Case("amd", Triple::AMD)
+      .Case("mesa", Triple::Mesa)
+      .Case("suse", Triple::SUSE)
+      .Case("oe", Triple::OpenEmbedded)
+      .Case("vgpu", Triple::OpenMP_VGPU)
+      .Default(Triple::UnknownVendor);
 }
 
 static Triple::OSType parseOS(StringRef OSName) {
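As a quick illustration of the Triple plumbing above, a standalone sketch (not part of the patch; assumes an LLVM build with these changes applied) showing the new vendor component round-tripping through llvm::Triple:

#include "llvm/ADT/Triple.h"
#include <cassert>

int main() {
  llvm::Triple T("x86_64-vgpu");
  // parseVendor() maps the "vgpu" component to the new enumerator,
  assert(T.getVendor() == llvm::Triple::OpenMP_VGPU);
  // and the vendor string is preserved verbatim.
  assert(T.getVendorName() == "vgpu");
  return 0;
}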
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -39,6 +39,8 @@
     set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
     set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
   endif()
+
+  list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
 endif()
 
 # Check and set up common compiler flags.
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -132,6 +132,7 @@
          -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
          -I${include_directory}
          -I${devicertl_base_directory}/../include
+         -I${devicertl_base_directory}/../plugins/vgpu/src
          ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
 )
 
@@ -153,7 +154,6 @@
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      -Xclang -target-cpu -Xclang ${target_cpu}
      ${target_bc_flags}
       ${infile} -o ${outfile}
       DEPENDS ${infile}
@@ -222,9 +222,11 @@
 
 # Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
+  compileDeviceRTLLibrary(sm_${sm} nvptx -Xclang -target-cpu -Xclang sm_${sm} -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
 endforeach()
 
 foreach(mcpu ${amdgpu_mcpus})
-  compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
+  compileDeviceRTLLibrary(${mcpu} amdgpu -Xclang -target-cpu -Xclang ${mcpu} -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
 endforeach()
+
+compileDeviceRTLLibrary(vgpu x86_64-vgpu -target x86_64-vgpu -std=c++20 -stdlib=libc++)
diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -49,6 +49,16 @@
 } // namespace impl
 #pragma omp end declare variant
 
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+int32_t vprintf(const char *, void *);
+namespace impl {
+static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+  return vprintf(Format, Arguments);
+}
+} // namespace impl
+#pragma omp end declare variant
+
 int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
   return impl::omp_vprintf(Format, Arguments, Size);
 }
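The device-runtime changes that follow all rely on the same selection mechanism: for the x86_64-vgpu target the "device" compile is an ordinary CPU compile, so OpenMP declare variant blocks guarded by device={kind(cpu)} shadow the GPU implementations. A minimal sketch of that selection (the function magic() is hypothetical; assumes a host compile with -fopenmp):

#include <cstdio>

int magic() { return 0; } // base version: picked on real GPUs

#pragma omp begin declare variant match(                                       \
    device = {kind(cpu)}, implementation = {extension(match_any)})
int magic() { return 42; } // picked when the device is a CPU, as for the VGPU
#pragma omp end declare variant

int main() { std::printf("%d\n", magic()); } // prints 42 on a CPU device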
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -110,6 +110,22 @@
   state::ParallelRegionFn = nullptr;
 }
 
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
+  FunctionTracingRAII();
+  const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
+  state::assumeInitialState(IsSPMD);
+  if (IsSPMD)
+    return;
+
+  // Signal the workers to exit the state machine and exit the kernel.
+  state::ParallelRegionFn = nullptr;
+
+  synchronize::threads();
+}
+#pragma omp end declare variant
+
 int8_t __kmpc_is_spmd_exec_mode() {
   FunctionTracingRAII();
   return mapping::isSPMDMode();
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -21,6 +21,83 @@
 
 using namespace _OMP;
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+
+namespace _OMP {
+namespace impl {
+
+constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::VirtualGpuGridValues;
+}
+
+LaneMaskTy activemask() {
+  LaneMaskTy B = 0;
+  uint32_t N = mapping::getWarpSize();
+  while (N)
+    B |= ((LaneMaskTy)1 << (--N));
+  return B;
+}
+
+LaneMaskTy lanemaskLT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  const uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  LaneMaskTy Ballot = mapping::activemask();
+  LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() {
+  return getThreadEnvironment()->getThreadIdInBlock();
+}
+
+uint32_t getNumHardwareThreadsInBlock() {
+  return getThreadEnvironment()->getBlockSize();
+}
+
+uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); }
+
+uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); }
+
+uint32_t getNumberOfBlocks() {
+  return getThreadEnvironment()->getNumberOfBlocks();
+}
+
+uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); }
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
+}
+
+} // namespace impl
+} // namespace _OMP
+
+#pragma omp end declare variant
+
 namespace _OMP {
 namespace impl {
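A worked example of the lanemask helpers defined above, assuming a warp size of 8, all lanes active (activemask() == 0xFF), and the calling thread in lane 3:

#include <cassert>
#include <cstdint>

using LaneMaskTy = uint64_t;

int main() {
  const LaneMaskTy Ballot = 0xFF; // all 8 lanes active
  const uint32_t Lane = 3;

  // lanemaskLT: bits for lanes 0..2 -> 0b00000111
  LaneMaskTy LT = (((LaneMaskTy)1 << Lane) - 1) & Ballot;
  assert(LT == 0x07);

  // lanemaskGT: bits for lanes 4..7 -> 0b11110000
  LaneMaskTy GT = ((~(LaneMaskTy)0) << (Lane + 1)) & Ballot;
  assert(GT == 0xF0);
  return 0;
}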
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -18,10 +18,9 @@
 namespace _OMP {
 namespace impl {
 
-/// AMDGCN Implementation
+/// Generic Implementation - AMDGCN, VGPU
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 double getWTick() { return ((double)1E-9); }
 
@@ -33,8 +32,6 @@
   return 0;
 }
 
-#pragma omp end declare variant
-
 /// NVPTX Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -283,6 +283,73 @@
 
 } // namespace impl
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+namespace impl {
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return VGPUImpl::atomicInc(Address, Val, Ordering);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % mapping::getWarpSize() == 0);
+  getThreadEnvironment()->namedBarrier(true);
+}
+
+void fenceTeam(int) { getThreadEnvironment()->fenceTeam(); }
+
+void fenceKernel(int memory_order) {
+  getThreadEnvironment()->fenceKernel(memory_order);
+}
+
+// Simply call fenceKernel because there is no need to sync with the host.
+void fenceSystem(int) { fenceKernel(0); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) {
+  getThreadEnvironment()->syncWarp();
+}
+
+void syncThreads() { getThreadEnvironment()->namedBarrier(false); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here, the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  VGPUImpl::setLock((uint32_t *)Lock, UNSET, SET, OMP_SPIN,
+                    mapping::getBlockId(), atomicCAS);
+}
+
+void syncThreadsAligned() {}
+
+} // namespace impl
+
+#pragma omp end declare variant
+///}
+
 void synchronize::init(bool IsSPMD) {
   if (!IsSPMD)
     impl::namedBarrierInit();
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -49,6 +49,24 @@
 
 #pragma omp end declare variant
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  *LowBits = (uint32_t)(Val & static_cast<uint64_t>(0x00000000FFFFFFFF));
+  *HighBits =
+      (uint32_t)((Val & static_cast<uint64_t>(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+
 /// NVPTX Implementation
 ///
 ///{
@@ -113,6 +131,26 @@
 #pragma omp end declare variant
 } // namespace impl
 
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+namespace impl {
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return getThreadEnvironment()->shuffle(Var, SrcLane);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  return getThreadEnvironment()->shuffleDown(Var, Delta);
+}
+
+} // namespace impl
+#pragma omp end declare variant
+
 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
   return impl::Pack(LowBits, HighBits);
 }
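The Pack/Unpack pair above is a plain 64-to-2x32-bit split; a quick standalone check of the round trip:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x123456789ABCDEF0ULL;
  uint32_t Lo = (uint32_t)(V & 0x00000000FFFFFFFFULL); // 0x9ABCDEF0
  uint32_t Hi = (uint32_t)(V >> 32);                   // 0x12345678
  uint64_t Packed = ((uint64_t)Hi << 32) | (uint64_t)Lo;
  assert(Packed == V); // pack(unpack(V)) is the identity
  return 0;
}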
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/CMakeLists.txt
@@ -75,6 +75,7 @@
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
 add_subdirectory(ve)
+add_subdirectory(vgpu)
 add_subdirectory(x86_64)
 add_subdirectory(remote)
 
diff --git a/openmp/libomptarget/plugins/vgpu/CMakeLists.txt b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(tmachine_name "vgpu")
+set(tmachine_libname "vgpu")
+set(tmachine_triple "x86_64-vgpu")
+set(elf_machine_id "62")
+
+if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+  if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+    libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+    include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+    include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+    include_directories(${LIBOMPTARGET_INCLUDE_DIR})
+
+    # Define macro to be used as prefix of the runtime messages for this target.
+    add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+    # Define macro with the ELF ID for this target.
+    add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+    add_library("omptarget.rtl.${tmachine_libname}" SHARED
+      ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironment.cpp)
+
+    # Install plugin under the lib destination folder.
+    install(TARGETS "omptarget.rtl.${tmachine_libname}"
+      LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+    set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20)
+    target_compile_options("omptarget.rtl.${tmachine_libname}" PRIVATE "-stdlib=libc++")
+
+    target_link_libraries(
+      "omptarget.rtl.${tmachine_libname}"
+      elf_common
+      ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+      ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+      dl
+      ${OPENMP_PTHREAD_LIB}
+      "-rdynamic"
+      c++
+      #"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+      )
+
+    list(APPEND LIBOMPTARGET_TESTED_PLUGINS
+      "omptarget.rtl.${tmachine_libname}")
+
+    # Report to the parent scope that we are building a plugin.
+    set(LIBOMPTARGET_SYSTEM_TARGETS
+      "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+    set(LIBOMPTARGET_TESTED_PLUGINS
+      "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+
+  else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+  endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+  libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
@@ -0,0 +1,72 @@
+//===---- ThreadEnvironment.h - Virtual GPU thread environment ----- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+
+#include <cstdint>
+
+using LaneMaskTy = uint64_t;
+
+// Forward declarations.
+class WarpEnvironmentTy;
+class ThreadBlockEnvironmentTy;
+class CTAEnvironmentTy;
+
+namespace VGPUImpl {
+class ThreadEnvironmentTy;
+void setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, uint32_t OmpSpin,
+             uint32_t BlockId,
+             uint32_t (*atomicCAS)(uint32_t *, uint32_t, uint32_t, int));
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+} // namespace VGPUImpl
+
+class ThreadEnvironmentTy {
+  VGPUImpl::ThreadEnvironmentTy *Impl;
+
+public:
+  ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE);
+
+  ~ThreadEnvironmentTy();
+
+  unsigned getThreadIdInWarp() const;
+
+  unsigned getThreadIdInBlock() const;
+
+  unsigned getGlobalThreadId() const;
+
+  unsigned getBlockSize() const;
+
+  unsigned getKernelSize() const;
+
+  unsigned getBlockId() const;
+
+  unsigned getNumberOfBlocks() const;
+
+  LaneMaskTy getActiveMask() const;
+
+  unsigned getWarpSize() const;
+
+  int32_t shuffle(int32_t Var, uint64_t SrcLane);
+
+  int32_t shuffleDown(int32_t Var, uint32_t Delta);
+
+  void fenceKernel(int32_t MemoryOrder);
+
+  void fenceTeam();
+
+  void syncWarp();
+
+  void namedBarrier(bool Generic);
+
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+
+  void resetBlockEnv();
+};
+
+ThreadEnvironmentTy *getThreadEnvironment(void);
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
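A sketch (not part of the patch) of how the device runtime consumes this interface: getThreadEnvironment() returns the calling worker thread's thread-local environment, so no handle needs to be passed around. The helper globalThreadId() below is hypothetical:

#include "ThreadEnvironment.h"

// Hypothetical consumer: compute a grid-global thread id from the
// thread-local environment, mirroring blockId * blockDim + threadId.
unsigned globalThreadId() {
  ThreadEnvironmentTy *Env = getThreadEnvironment();
  return Env->getBlockId() * Env->getBlockSize() + Env->getThreadIdInBlock();
}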
diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
@@ -0,0 +1,120 @@
+//===---- ThreadEnvironment.cpp - Virtual GPU thread environment -- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of VGPU environment classes.
+//
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+#include <cstdint>
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include <ctime>
+#include <mutex>
+// clang-format on
+
+static std::mutex AtomicIncLock;
+
+uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  std::lock_guard<std::mutex> G(AtomicIncLock);
+  uint32_t V = *Address;
+  if (V >= Val)
+    *Address = 0;
+  else
+    *Address += 1;
+  return V;
+}
+
+void VGPUImpl::setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set,
+                       uint32_t OmpSpin, uint32_t BlockId,
+                       uint32_t (*atomicCAS)(uint32_t *, uint32_t, uint32_t,
+                                             int)) {
+  // TODO: Not sure spinning is a good idea here.
+  while (atomicCAS((uint32_t *)Lock, Unset, Set, __ATOMIC_SEQ_CST) != Unset) {
+    std::clock_t start = std::clock();
+    std::clock_t now;
+    for (;;) {
+      now = std::clock();
+      std::clock_t cycles =
+          now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= 1000 * BlockId) {
+        break;
+      }
+    }
+  } // Spin until the read value is Unset.
+}
+
+extern thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; }
+
+ThreadEnvironmentTy::ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                                         CTAEnvironmentTy *CTAE)
+    : Impl(new VGPUImpl::ThreadEnvironmentTy(Id, WE, CTAE)) {}
+
+ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; }
+
+void ThreadEnvironmentTy::fenceTeam() { Impl->fenceTeam(); }
+
+void ThreadEnvironmentTy::syncWarp() { Impl->syncWarp(); }
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+  return Impl->getThreadIdInWarp();
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+  return Impl->getThreadIdInBlock();
+}
+
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+  return Impl->getGlobalThreadId();
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+  return Impl->getBlockSize();
+}
+
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+  return Impl->getKernelSize();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); }
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+  return Impl->getNumberOfBlocks();
+}
+
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const {
+  return Impl->getActiveMask();
+}
+
+int32_t ThreadEnvironmentTy::shuffle(int32_t Var, uint64_t SrcLane) {
+  return Impl->shuffle(Var, SrcLane);
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(int32_t Var, uint32_t Delta) {
+  return Impl->shuffleDown(Var, Delta);
+}
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+  return Impl->fenceKernel(MemoryOrder);
+}
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+  Impl->namedBarrier(Generic);
+}
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+  Impl->setBlockEnv(TBE);
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); }
+
+unsigned ThreadEnvironmentTy::getWarpSize() const {
+  return Impl->getWarpSize();
+}
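VGPUImpl::atomicInc above mirrors CUDA's atomicInc semantics: the old value is returned, and the stored value increments until it reaches Val, then wraps to 0. A small standalone check (hypothetical driver, assuming it is linked against ThreadEnvironment.cpp):

#include <cassert>
#include <cstdint>
#include "ThreadEnvironment.h"

int main() {
  uint32_t Counter = 0;
  // Old values cycle 0, 1, 2 and then the counter wraps to 0.
  assert(VGPUImpl::atomicInc(&Counter, 2, 0) == 0);
  assert(VGPUImpl::atomicInc(&Counter, 2, 0) == 1);
  assert(VGPUImpl::atomicInc(&Counter, 2, 0) == 2);
  assert(Counter == 0);
  return 0;
}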
diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
@@ -0,0 +1,168 @@
+//===---- ThreadEnvironmentImpl.h - Virtual GPU thread environment - C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+
+#include "ThreadEnvironment.h"
+#include <atomic>
+#include <barrier>
+#include <cstdint>
+#include <functional>
+#include <mutex>
+#include <vector>
+
+class WarpEnvironmentTy {
+  const unsigned ID;
+  const unsigned NumThreads;
+
+  std::vector<int32_t> ShuffleBuffer;
+
+  std::barrier<std::function<void()>> Barrier;
+  std::barrier<std::function<void()>> ShuffleBarrier;
+  std::barrier<std::function<void()>> ShuffleDownBarrier;
+
+public:
+  WarpEnvironmentTy(unsigned ID, unsigned NumThreads)
+      : ID(ID), NumThreads(NumThreads), ShuffleBuffer(NumThreads),
+        Barrier(NumThreads, []() {}), ShuffleBarrier(NumThreads, []() {}),
+        ShuffleDownBarrier(NumThreads, []() {}) {}
+
+  unsigned getWarpId() const { return ID; }
+  int getNumThreads() const { return NumThreads; }
+
+  void sync() { Barrier.arrive_and_wait(); }
+
+  void writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+    ShuffleBuffer[LaneId] = Var;
+  }
+
+  int32_t getShuffleBuffer(unsigned LaneId) { return ShuffleBuffer[LaneId]; }
+
+  void waitShuffleBarrier() { ShuffleBarrier.arrive_and_wait(); }
+
+  void waitShuffleDownBarrier() { ShuffleDownBarrier.arrive_and_wait(); }
+};
+
+class CTAEnvironmentTy {
+public:
+  unsigned ID;
+  unsigned NumThreads;
+  unsigned NumBlocks;
+
+  std::barrier<std::function<void()>> Barrier;
+  std::barrier<std::function<void()>> SyncThreads;
+  std::barrier<std::function<void()>> NamedBarrier;
+
+  CTAEnvironmentTy(unsigned ID, unsigned NumThreads, unsigned NumBlocks)
+      : ID(ID), NumThreads(NumThreads), NumBlocks(NumBlocks),
+        Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+        NamedBarrier(NumThreads, []() {}) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumThreads() const { return NumThreads; }
+
+  unsigned getNumBlocks() const { return NumBlocks; }
+
+  void fence() { Barrier.arrive_and_wait(); }
+  void syncThreads() { SyncThreads.arrive_and_wait(); }
+  void namedBarrier() { NamedBarrier.arrive_and_wait(); }
+};
+
+class ThreadBlockEnvironmentTy {
+  unsigned ID;
+  unsigned NumBlocks;
+
+public:
+  ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks)
+      : ID(ID), NumBlocks(NumBlocks) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumBlocks() const { return NumBlocks; }
+};
+namespace VGPUImpl {
+class ThreadEnvironmentTy {
+  unsigned ThreadIdInWarp;
+  unsigned ThreadIdInBlock;
+  unsigned GlobalThreadIdx;
+
+  WarpEnvironmentTy *WarpEnvironment;
+  ThreadBlockEnvironmentTy *ThreadBlockEnvironment;
+  CTAEnvironmentTy *CTAEnvironment;
+
+public:
+  ThreadEnvironmentTy(unsigned ThreadId, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE)
+      : ThreadIdInWarp(ThreadId),
+        ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadId),
+        GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() +
+                        ThreadIdInBlock),
+        WarpEnvironment(WE), ThreadBlockEnvironment(nullptr),
+        CTAEnvironment(CTAE) {}
+
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+    ThreadBlockEnvironment = TBE;
+  }
+
+  void resetBlockEnv() {
+    delete ThreadBlockEnvironment;
+    ThreadBlockEnvironment = nullptr;
+  }
+
+  unsigned getThreadIdInWarp() const { return ThreadIdInWarp; }
+  unsigned getThreadIdInBlock() const { return ThreadIdInBlock; }
+  unsigned getGlobalThreadId() const { return GlobalThreadIdx; }
+
+  unsigned getBlockSize() const { return CTAEnvironment->getNumThreads(); }
+
+  unsigned getBlockId() const { return ThreadBlockEnvironment->getId(); }
+
+  unsigned getNumberOfBlocks() const {
+    return ThreadBlockEnvironment->getNumBlocks();
+  }
+
+  unsigned getKernelSize() const {
+    // Assumes full thread blocks: total threads launched for the kernel.
+    return getBlockSize() * getNumberOfBlocks();
+  }
+
+  // FIXME: This is wrong
+  LaneMaskTy getActiveMask() const { return ~(LaneMaskTy)0; }
+
+  void fenceTeam() { CTAEnvironment->fence(); }
+  void syncWarp() { WarpEnvironment->sync(); }
+
+  int32_t shuffle(int32_t Var, uint64_t SrcLane) {
+    WarpEnvironment->waitShuffleBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleBarrier();
+    Var = WarpEnvironment->getShuffleBuffer(SrcLane % getWarpSize());
+    return Var;
+  }
+
+  int32_t shuffleDown(int32_t Var, uint32_t Delta) {
+    WarpEnvironment->waitShuffleDownBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleDownBarrier();
+    Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+                                            getWarpSize());
+    return Var;
+  }
+
+  void namedBarrier(bool Generic) {
+    if (Generic) {
+      CTAEnvironment->namedBarrier();
+    } else {
+      CTAEnvironment->syncThreads();
+    }
+  }
+
+  void fenceKernel(int32_t MemoryOrder) {
+    std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+  }
+
+  unsigned getWarpSize() const { return WarpEnvironment->getNumThreads(); }
+};
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
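The warp and CTA synchronization above is built on C++20 std::barrier, which is why the plugin and the vgpu device runtime are compiled with -std=c++20 and libc++. A minimal standalone sketch of the arrive_and_wait pattern that WarpEnvironmentTy::sync() uses:

#include <barrier>
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

int main() {
  constexpr int NumLanes = 4; // example warp size
  std::barrier<std::function<void()>> WarpBarrier(NumLanes, []() {});

  std::vector<std::thread> Lanes;
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    Lanes.emplace_back([&WarpBarrier, Lane]() {
      // ... per-lane work, then wait for the whole warp, like sync().
      WarpBarrier.arrive_and_wait();
      std::printf("lane %d past the barrier\n", Lane);
    });
  for (auto &T : Lanes)
    T.join();
  return 0;
}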
diff --git a/openmp/libomptarget/plugins/vgpu/src/rtl.cpp b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp
@@ -0,0 +1,623 @@
+//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for virtual (x86) GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <barrier>
+#include <cassert>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <functional>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "Debug.h"
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#include "elf_common.h"
+
+#define NUMBER_OF_DEVICES 1
+#define OFFLOADSECTIONNAME "omp_offloading_entries"
+
+#define DEBUG false
+
+/// Array of Dynamic libraries loaded for this target.
+struct DynLibTy {
+  char *FileName;
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    return &E.Table;
+  }
+
+  RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+std::vector<CTAEnvironmentTy *> CTAEnvironments;
+std::vector<WarpEnvironmentTy *> WarpEnvironments;
+
+struct VGPUTy {
+  struct KernelTy {
+    ffi_cif *Cif;
+    std::function<void(void)> Kernel;
+    int NumTeams;
+
+    KernelTy(ffi_cif *Cif, std::function<void(void)> Kernel, int NumTeams)
+        : Cif(Cif), Kernel(Kernel), NumTeams(NumTeams) {}
+  };
+
+  struct VGPUStreamTy {
+    std::queue<KernelTy> Kernels;
+    std::mutex Mtx;
+
+    void emplace(ffi_cif *Cif, std::function<void(void)> F, int NumTeams) {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      Kernels.emplace(Cif, F, NumTeams);
+    }
+
+    KernelTy front() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      return Kernels.front();
+    }
+
+    void pop() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      Kernels.pop();
+    }
+
+    bool empty() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      return Kernels.empty();
+    }
+  };
+
+  struct AsyncInfoQueueTy {
+    std::deque<__tgt_async_info *> Streams;
+    std::mutex Mtx;
+
+    bool empty() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      return Streams.empty();
+    }
+
+    __tgt_async_info *front() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      return Streams.front();
+    }
+
+    void pop() {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      Streams.pop_front();
+    }
+
+    void emplace(__tgt_async_info *AsyncInfo) {
+      std::lock_guard<std::mutex> Guard(Mtx);
+      Streams.emplace_back(AsyncInfo);
+    }
+  } ExecutionQueue;
+
+  VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) {
+    assert(AsyncInfo != nullptr && "async_info ptr was null");
+
+    if (!AsyncInfo->Queue)
+      AsyncInfo->Queue = new VGPUStreamTy();
+
+    return reinterpret_cast<VGPUStreamTy *>(AsyncInfo->Queue);
+  }
+
+  std::atomic<bool> Running;
+  std::vector<std::thread> Threads;
+  int WarpsPerCTA;
+  int NumCTAs;
+
+  std::unique_ptr<std::barrier<std::function<void()>>> Barrier;
+  std::condition_variable WorkAvailable;
+  std::mutex WorkDoneMtx;
+  std::condition_variable WorkDone;
+
+  VGPUTy(int NumThreads = -1, int ThreadsPerWarp = -1, int WarpsPerCTA = -1)
+      : Running(true) {
+    if (const char *Env = std::getenv("VGPU_NUM_THREADS"))
+      NumThreads = std::stoi(Env);
+    if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP"))
+      ThreadsPerWarp = std::stoi(Env);
+    if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA"))
+      WarpsPerCTA = std::stoi(Env);
+
+    if (NumThreads == -1)
+      NumThreads = std::thread::hardware_concurrency();
+    if (ThreadsPerWarp == -1)
+      ThreadsPerWarp = NumThreads;
+    if (WarpsPerCTA == -1)
+      WarpsPerCTA = 1;
+
+    NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA);
+
+    // printf("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n",
+    //        NumThreads, ThreadsPerWarp, WarpsPerCTA);
+
+    assert(NumThreads % ThreadsPerWarp == 0 && NumThreads % WarpsPerCTA == 0 &&
+           "Invalid VGPU Config");
+
+    Barrier = std::make_unique<std::barrier<std::function<void()>>>(
+        NumThreads, []() {});
+
+    Threads.reserve(NumThreads);
+
+    auto GlobalThreadIdx = 0;
+    for (auto CTAIdx = 0; CTAIdx < NumCTAs; CTAIdx++) {
+      auto *CTAEnv =
+          new CTAEnvironmentTy(CTAIdx, NumThreads / NumCTAs, NumCTAs);
+      for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) {
+        auto *WarpEnv = new WarpEnvironmentTy(WarpIdx, ThreadsPerWarp);
+        for (auto ThreadIdx = 0; ThreadIdx < ThreadsPerWarp; ThreadIdx++) {
+          Threads.emplace_back(
+              [this, ThreadIdx, GlobalThreadIdx, CTAEnv, WarpEnv]() {
+                ThreadEnvironment =
+                    new ThreadEnvironmentTy(ThreadIdx, WarpEnv, CTAEnv);
+                std::function<void(void)> Kernel;
+                while (Running) {
+                  {
+                    std::unique_lock<std::mutex> UniqueLock(ExecutionQueue.Mtx);
+
+                    WorkAvailable.wait(UniqueLock, [&]() {
+                      if (!Running)
+                        return true;
+                      bool IsEmpty = ExecutionQueue.Streams.empty();
+                      return !IsEmpty;
+                    });
+                  }
+
+                  if (ExecutionQueue.empty())
+                    continue;
+
+                  while (!ExecutionQueue.empty()) {
+                    auto *Stream = getStream(ExecutionQueue.front());
+                    while (!Stream->empty()) {
+                      auto KernelInfo = Stream->front();
+                      Kernel = KernelInfo.Kernel;
+
+                      const unsigned NumTeams = KernelInfo.NumTeams;
+                      unsigned TeamIdx = 0;
+                      while (TeamIdx < NumTeams) {
+                        if (TeamIdx + CTAEnv->getId() < NumTeams) {
+                          ThreadEnvironment->setBlockEnv(
+                              new ThreadBlockEnvironmentTy(
+                                  TeamIdx + CTAEnv->getId(), NumTeams));
+                          Kernel();
+                          ThreadEnvironment->resetBlockEnv();
+                        }
+                        Barrier->arrive_and_wait();
+                        TeamIdx += NumCTAs;
+                      }
+
+                      if (GlobalThreadIdx == 0) {
+                        delete KernelInfo.Cif;
+                        Stream->pop();
+                      }
+
+                      Barrier->arrive_and_wait();
+                    }
+                    if (GlobalThreadIdx == 0) {
+                      ExecutionQueue.pop();
+                      WorkDone.notify_all();
+                    }
+                    Barrier->arrive_and_wait();
+                  }
+                }
+                delete ThreadEnvironment;
+              });
+          GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads;
+        }
+        WarpEnvironments.push_back(WarpEnv);
+      }
+      CTAEnvironments.push_back(CTAEnv);
+    }
+  }
+
+  ~VGPUTy() {
+    awaitAll();
+
+    Running = false;
+    WorkAvailable.notify_all();
+
+    for (auto &Thread : Threads) {
+      if (Thread.joinable()) {
+        Thread.join();
+      }
+    }
+
+    for (auto *CTAEnv : CTAEnvironments)
+      delete CTAEnv;
+
+    for (auto *WarpEnv : WarpEnvironments)
+      delete WarpEnv;
+  }
+
+  void await(__tgt_async_info *AsyncInfo) {
+    std::unique_lock<std::mutex> UniqueLock(getStream(AsyncInfo)->Mtx);
+    WorkDone.wait(UniqueLock,
+                  [&]() { return getStream(AsyncInfo)->Kernels.empty(); });
+  }
+
+  void awaitAll() {
+    while (!ExecutionQueue.empty()) {
+      await(ExecutionQueue.front());
+    }
+  }
+
+  void scheduleAsync(__tgt_async_info *AsyncInfo, ffi_cif *Cif,
+                     std::function<void(void)> F, int NumTeams) {
+    if (NumTeams == 0)
+      NumTeams = NumCTAs;
+    auto *Stream = getStream(AsyncInfo);
+    Stream->emplace(Cif, F, NumTeams);
+    ExecutionQueue.emplace(AsyncInfo);
+    WorkAvailable.notify_all();
+  }
+};
+
+VGPUTy VGPU;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler.
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset.
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    gelf_getshdr(section, &hdr);
+
+    if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // Load the dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid
+  // the dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    elf_end(e);
+    return NULL;
+  }
+
+  fwrite(image->ImageStart, ImageSize, 1, ftmp);
+  fclose(ftmp);
+
+  DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Sample implementation of explicit memory allocator. For this plugin all
+// kinds are equivalent to each other.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
+                           int32_t kind) {
+  void *ptr = NULL;
+
+  switch (kind) {
+  case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_HOST:
+  case TARGET_ALLOC_SHARED:
+  case TARGET_ALLOC_DEFAULT:
+    ptr = malloc(size);
+    break;
+  default:
+    REPORT("Invalid target data allocation kind");
+  }
+
+  return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  VGPU.awaitAll();
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  VGPU.awaitAll();
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info *async_info) {
+  VGPU.await(async_info);
+  delete (VGPUTy::VGPUStreamTy *)async_info->Queue;
+  async_info->Queue = nullptr;
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
+                                         void *tgt_entry_ptr, void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+  __tgt_async_info AsyncInfo;
+  int rc = __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, &AsyncInfo);
+
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+    __tgt_async_info *async_info) {
+  ffi_cif *cif = new ffi_cif();
+
+  // All args are references.
+  std::shared_ptr<std::vector<ffi_type *>> args_types =
+      std::make_shared<std::vector<ffi_type *>>(arg_num, &ffi_type_pointer);
+  std::shared_ptr<std::vector<void *>> args =
+      std::make_shared<std::vector<void *>>(arg_num);
+  std::shared_ptr<std::vector<void *>> ptrs =
+      std::make_shared<std::vector<void *>>(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    (*ptrs)[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    (*args)[i] = &(*ptrs)[i];
+  }
+
+  ffi_status status = ffi_prep_cif(cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, &(*args_types)[0]);
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  void (*entry)(void);
+  *((void **)&entry) = tgt_entry_ptr;
+
+  VGPU.scheduleAsync(
+      async_info, cif,
+      [cif, entry, args, args_types, ptrs]() {
+        // Capture the shared argument storage by value so it stays alive
+        // until the kernel has run on the worker threads.
+        ffi_call(cif, entry, NULL, &(*args)[0]);
+        (void)args_types;
+        (void)ptrs;
+      },
+      team_num);
+  VGPU.await(async_info);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num, 1, 1, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+                                          void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+                                          int32_t arg_num,
+                                          __tgt_async_info *async_info) {
+  return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr,
+                                                tgt_args, tgt_offsets, arg_num,
+                                                1, 1, 0, async_info);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -30,6 +30,7 @@
     /* SX-Aurora VE target */ "libomptarget.rtl.ve.so",
     /* AMDGPU target */ "libomptarget.rtl.amdgpu.so",
     /* Remote target */ "libomptarget.rtl.rpc.so",
+    /* Virtual GPU target */ "libomptarget.rtl.vgpu.so",
 };
 
 PluginManager *PM;
@@ -79,7 +80,13 @@
   // is correct and if they are supporting any devices.
   for (auto *Name : RTLNames) {
     DP("Loading library '%s'...\n", Name);
-    void *dynlib_handle = dlopen(Name, RTLD_NOW);
+
+    int Flags = RTLD_NOW;
+
+    if (strcmp(Name, "libomptarget.rtl.vgpu.so") == 0)
+      Flags |= RTLD_GLOBAL;
+
+    void *dynlib_handle = dlopen(Name, Flags);
     if (!dynlib_handle) {
       // Library does not exist or cannot be found.
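Taken together, these pieces let an ordinary OpenMP offload program execute on host threads through the GPU code path, e.g. built with clang++ -fopenmp -fopenmp-targets=x86_64-vgpu (assuming a toolchain built with this patch; the grid geometry is tunable via the VGPU_NUM_THREADS, VGPU_THREADS_PER_WARP, and VGPU_WARPS_PER_CTA environment variables read by the plugin):

#include <cstdio>

int main() {
  int Sum = 0;
  // Offloaded to the virtual GPU: teams map to CTAs, threads to worker
  // threads of the vgpu plugin.
#pragma omp target teams distribute parallel for reduction(+ : Sum)           \
    map(tofrom : Sum)
  for (int I = 0; I < 1024; ++I)
    Sum += I;
  std::printf("%d\n", Sum); // prints 523776
  return 0;
}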