diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -151,6 +151,9 @@ MaxOpenCLWorkGroupSize = 1024; ProgramAddrSpace = 0; + + if (Triple.getVendor() == llvm::Triple::OpenMP_VGPU) + AddrSpaceMap = &llvm::omp::OpenMPVGPUAddrSpaceMap; } // Out of line virtual dtor for TargetInfo. diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -17,6 +17,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/Triple.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/X86TargetParser.h" @@ -388,6 +389,10 @@ uint64_t getPointerAlignV(unsigned AddrSpace) const override { return getPointerWidthV(AddrSpace); } + + const llvm::omp::GV &getGridValue() const override { + return llvm::omp::VirtualGpuGridValues; + } }; // X86-32 generic target diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1119,10 +1119,11 @@ CGM.addCompilerUsedGlobal(GVMode); } -void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID, - llvm::Constant *Addr, - uint64_t Size, int32_t, - llvm::GlobalValue::LinkageTypes) { +void CGOpenMPRuntimeGPU::createOffloadEntry( + llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags, + llvm::GlobalValue::LinkageTypes Linkage) { + if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) + return CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage); // TODO: Add support for global variables on the device after declare target // support. 
llvm::Function *Fn = dyn_cast(Addr); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -249,7 +249,9 @@ OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); break; default: - if (LangOpts.OpenMPSimd) + if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) + OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); + else if (LangOpts.OpenMPSimd) OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this)); else OpenMPRuntime.reset(new CGOpenMPRuntime(*this)); diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -3069,4 +3069,13 @@ if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fno-use-init-array"); + + if (DriverArgs.hasArg(options::OPT_S)) + return; + + if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) { + std::string BitcodeSuffix = getTripleString() + "-openmp_vgpu"; + clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, + BitcodeSuffix, getTriple()); + } } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3985,7 +3985,8 @@ } // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options - Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && + Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && + (T.isNVPTX() || T.isAMDGCN() || T.isOpenMPVGPU()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -165,7 +165,8 @@ Mesa, SUSE, OpenEmbedded, - 
LastVendorType = OpenEmbedded + OpenMP_VGPU, + LastVendorType = OpenMP_VGPU }; enum OSType { UnknownOS, @@ -691,6 +692,9 @@ return getArch() == Triple::nvptx || getArch() == Triple::nvptx64; } + /// Tests whether the target is OpenMP VGPU. + bool isOpenMPVGPU() const { return getVendor() == llvm::Triple::OpenMP_VGPU; } + /// Tests whether the target is AMDGCN bool isAMDGCN() const { return getArch() == Triple::amdgcn; } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -114,6 +114,38 @@ 128, // GV_Default_WG_Size }; +/// For Virtual GPUs +static constexpr GV VirtualGpuGridValues = { + 256, // GV_Slot_Size + 32, // GV_Warp_Size + 1024, // GV_Max_Teams + 896, // GV_SimpleBufferSize + 1024, // GV_Max_WG_Size + 128, // GV_Default_WG_Size +}; + +static const unsigned OpenMPVGPUAddrSpaceMap[] = { + 0, // Default + 1, // opencl_global + 3, // opencl_local + 4, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 1, // opencl_global_device + 1, // opencl_global_host + 1, // cuda_device + 4, // cuda_constant + 3, // cuda_shared + 1, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 3, // sycl_local + 0, // sycl_private + 270, // ptr32_sptr + 271, // ptr32_uptr + 272 // ptr64 +}; + } // namespace omp } // namespace llvm diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -185,6 +185,8 @@ case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case OpenMP_VGPU: + return "openmp_vgpu"; } llvm_unreachable("Invalid VendorType!"); @@ -492,22 +494,23 @@ static Triple::VendorType parseVendor(StringRef VendorName) { return StringSwitch(VendorName) - .Case("apple", Triple::Apple) - .Case("pc", Triple::PC) - .Case("scei", Triple::SCEI) - .Case("sie", Triple::SCEI) -
.Case("fsl", Triple::Freescale) - .Case("ibm", Triple::IBM) - .Case("img", Triple::ImaginationTechnologies) - .Case("mti", Triple::MipsTechnologies) - .Case("nvidia", Triple::NVIDIA) - .Case("csr", Triple::CSR) - .Case("myriad", Triple::Myriad) - .Case("amd", Triple::AMD) - .Case("mesa", Triple::Mesa) - .Case("suse", Triple::SUSE) - .Case("oe", Triple::OpenEmbedded) - .Default(Triple::UnknownVendor); + .Case("apple", Triple::Apple) + .Case("pc", Triple::PC) + .Case("scei", Triple::SCEI) + .Case("sie", Triple::SCEI) + .Case("fsl", Triple::Freescale) + .Case("ibm", Triple::IBM) + .Case("img", Triple::ImaginationTechnologies) + .Case("mti", Triple::MipsTechnologies) + .Case("nvidia", Triple::NVIDIA) + .Case("csr", Triple::CSR) + .Case("myriad", Triple::Myriad) + .Case("amd", Triple::AMD) + .Case("mesa", Triple::Mesa) + .Case("suse", Triple::SUSE) + .Case("oe", Triple::OpenEmbedded) + .Case("openmp_vgpu", Triple::OpenMP_VGPU) + .Default(Triple::UnknownVendor); } static Triple::OSType parseOS(StringRef OSName) { diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -44,6 +44,8 @@ set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) endif() + + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include) endif() # Check and set up common compiler flags. 
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -157,9 +157,8 @@ add_custom_command(OUTPUT ${outfile} COMMAND ${CLANG_TOOL} - ${bc_flags} - -Xclang -target-cpu -Xclang ${target_cpu} ${target_bc_flags} + ${bc_flags} ${infile} -o ${outfile} DEPENDS ${infile} IMPLICIT_DEPENDS CXX ${infile} @@ -227,9 +226,11 @@ # Generate a Bitcode library for all the compute capabilities the user requested foreach(sm ${nvptx_sm_list}) - compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64-nvidia-cuda -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") + compileDeviceRTLLibrary(sm_${sm} nvptx -Xclang -target-cpu -Xclang sm_${sm} -target nvptx64-nvidia-cuda -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") endforeach() foreach(mcpu ${amdgpu_mcpus}) - compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib) + compileDeviceRTLLibrary(${mcpu} amdgpu -Xclang -target-cpu -Xclang ${mcpu} -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib) endforeach() + +compileDeviceRTLLibrary(x86_64 vgpu -target x86_64-vgpu -std=c++20 -I${devicertl_base_directory}/../plugins/vgpu/src) diff --git a/openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h b/openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h @@ -0,0 +1,11 @@ +//===--- ThreadEnvironment.h - OpenMP VGPU Dummy Header File ------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Dummy header file to avoid preprocessor errors in device runtime. 
+// +//===----------------------------------------------------------------------===// diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -49,6 +49,15 @@ } // namespace impl #pragma omp end declare variant +#pragma omp begin declare variant match(device = {kind(cpu)}) +int32_t vprintf(const char *, void *); +namespace impl { +static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) { + return vprintf(Format, Arguments); +} +} // namespace impl +#pragma omp end declare variant + int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) { return impl::omp_vprintf(Format, Arguments, Size); } diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -17,10 +17,85 @@ #pragma omp declare target +#include "ThreadEnvironment.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" using namespace _OMP; +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match(device = {kind(cpu)}) + +namespace _OMP { +namespace impl { + +constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::VirtualGpuGridValues; +} + +LaneMaskTy activemask() { + uint64_t B = 0; + uint32_t N = mapping::getWarpSize(); + while (N) + B |= (1 << (--N)); + return B; +} + +LaneMaskTy lanemaskLT() { + const uint32_t Lane = mapping::getThreadIdInWarp(); + LaneMaskTy Ballot = mapping::activemask(); + LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1; + return Mask & Ballot; +} + +LaneMaskTy lanemaskGT() { + const uint32_t Lane = mapping::getThreadIdInWarp(); + if (Lane == (mapping::getWarpSize() - 1)) + return 0; + LaneMaskTy Ballot = mapping::activemask(); + LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1); + return Mask & Ballot; 
+} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { + return getThreadEnvironment()->getThreadIdInBlock(); +} + +uint32_t getNumHardwareThreadsInBlock() { + return getThreadEnvironment()->getBlockSize(); +} + +uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); } + +uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); } + +uint32_t getNumberOfBlocks() { + return getThreadEnvironment()->getNumberOfBlocks(); +} + +uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); } + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +} // namespace impl +} // namespace _OMP + +#pragma omp end declare variant + namespace _OMP { namespace impl { diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp --- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -18,10 +18,9 @@ namespace _OMP { namespace impl { -/// AMDGCN Implementation +/// Generic Implementation - AMDGCN, VGPU /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) double getWTick() { return ((double)1E-9); } @@ -33,8 +32,6 @@ return 0; } -#pragma omp end declare variant - /// NVPTX Implementation /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -16,6 +16,7 @@ #include "Interface.h" #include "Mapping.h" #include "State.h" +#include "ThreadEnvironment.h" #include "Types.h" #include "Utils.h" @@ -283,6 +284,64 
@@ } // namespace impl +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match(device = {kind(cpu)}) + +namespace impl { + +uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) { + return VGPUImpl::atomicInc(Address, Val, Ordering); +} + +void namedBarrierInit() {} + +void namedBarrier() { + uint32_t NumThreads = omp_get_num_threads(); + ASSERT(NumThreads % mapping::getWarpSize() == 0); + getThreadEnvironment()->namedBarrier(true); +} + +void fenceTeam(int Ordering) { getThreadEnvironment()->fenceTeam(Ordering); } + +void fenceKernel(int Ordering) { + getThreadEnvironment()->fenceKernel(Ordering); +} + +// Simply call fenceKernel because there is no need to sync with host +void fenceSystem(int Ordering) { fenceKernel(Ordering); } + +void syncWarp(__kmpc_impl_lanemask_t Mask) { + getThreadEnvironment()->syncWarp(Mask); +} + +void syncThreads() { getThreadEnvironment()->namedBarrier(false); } + +constexpr uint32_t OMP_SPIN = 1000; +constexpr uint32_t UNSET = 0; +constexpr uint32_t SET = 1; + +// TODO: This seems to hide a bug in the declare variant handling. If it is +// called before it is defined +// here the overload won't happen. Investigate later!
+void unsetLock(omp_lock_t *Lock) { VGPUImpl::unsetLock((uint32_t *)Lock); } + +int testLock(omp_lock_t *Lock) { return VGPUImpl::testLock((uint32_t *)Lock); } + +void initLock(omp_lock_t *Lock) { VGPUImpl::initLock((uint32_t *)Lock); } + +void destroyLock(omp_lock_t *Lock) { VGPUImpl::destroyLock((uint32_t *)Lock); } + +void setLock(omp_lock_t *Lock) { VGPUImpl::setLock((uint32_t *)Lock); } + +void syncThreadsAligned() {} + +} // namespace impl + +#pragma omp end declare variant +///} + void synchronize::init(bool IsSPMD) { if (!IsSPMD) impl::namedBarrierInit(); diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -14,6 +14,7 @@ #include "Debug.h" #include "Interface.h" #include "Mapping.h" +#include "ThreadEnvironment.h" #pragma omp declare target @@ -32,10 +33,9 @@ namespace impl { -/// AMDGCN Implementation +/// AMDGCN/Generic Implementation /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); @@ -47,8 +47,6 @@ return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } -#pragma omp end declare variant - /// NVPTX Implementation /// ///{ @@ -113,6 +111,24 @@ #pragma omp end declare variant } // namespace impl +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match(device = {kind(cpu)}) + +namespace impl { + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return getThreadEnvironment()->shuffle(Mask, Var, SrcLane); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + return getThreadEnvironment()->shuffleDown(Mask, Var, Delta); +} + +} // namespace impl +#pragma omp end declare variant + uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { return impl::Pack(LowBits, HighBits); } diff --git 
a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ b/openmp/libomptarget/plugins/CMakeLists.txt @@ -75,6 +75,7 @@ add_subdirectory(ppc64) add_subdirectory(ppc64le) add_subdirectory(ve) +add_subdirectory(vgpu) add_subdirectory(x86_64) add_subdirectory(remote) diff --git a/openmp/libomptarget/plugins/vgpu/CMakeLists.txt b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/CMakeLists.txt @@ -0,0 +1,74 @@ +##===----------------------------------------------------------------------===## +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the VGPU plugin for virtual GPU offloading. +# +##===----------------------------------------------------------------------===## + +if (NOT(LIBOMPTARGET_ENABLE_EXPERIMENTAL_VGPU_PLUGIN)) + return() +endif() + +macro(build_generic_elf64_vgpu tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Building ${tmachine_triple}-${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target.
+ add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironmentImpl.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20) + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + elf_common + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + # ${OPENMP_PTHREAD_LIB} + "-rdynamic" + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + ) + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}") + + # Report to the parent scope that we are building a plugin. + set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS + "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) +else() + libomptarget_say("Not building ${tmachine_name}-vgpu offloading plugin: machine not found in the system.") +endif() +endmacro() + +build_generic_elf64_vgpu("x86_64" "vgpu" "vgpu" "x86_64-vgpu" "62") + diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h @@ -0,0 +1,73 @@ +//===---- ThreadEnvironment.h - Virtual GPU thread environment 
----- C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H +#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H + +using LaneMaskTy = uint64_t; + +// Forward declaration +class WarpEnvironmentTy; +class ThreadBlockEnvironmentTy; +class CTAEnvironmentTy; +namespace VGPUImpl { +class ThreadEnvironmentTy; +void initLock(uint32_t *Lock); +void destroyLock(uint32_t *Lock); +void setLock(uint32_t *Lock); +void unsetLock(uint32_t *Lock); +bool testLock(uint32_t *Lock); +uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering); +} // namespace VGPUImpl + +class ThreadEnvironmentTy { + VGPUImpl::ThreadEnvironmentTy *Impl; + +public: + ThreadEnvironmentTy(WarpEnvironmentTy *WE, CTAEnvironmentTy *CTAE); + + ~ThreadEnvironmentTy(); + + unsigned getThreadIdInWarp() const; + + unsigned getThreadIdInBlock() const; + + unsigned getGlobalThreadId() const; + + unsigned getBlockSize() const; + + unsigned getKernelSize() const; + + unsigned getBlockId() const; + + unsigned getNumberOfBlocks() const; + + LaneMaskTy getActiveMask() const; + + unsigned getWarpSize() const; + + int32_t shuffle(uint64_t Mask, int32_t Var, uint64_t SrcLane); + + int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta); + + void fenceKernel(int32_t MemoryOrder); + + void fenceTeam(int MemoryOrder); + + void syncWarp(int Mask); + + void namedBarrier(bool Generic); + + void setBlockEnv(ThreadBlockEnvironmentTy *TBE); + + void resetBlockEnv(); +}; + +ThreadEnvironmentTy *getThreadEnvironment(void); + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp 
b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp @@ -0,0 +1,117 @@ +//===---- ThreadEnvironment.cpp - Virtual GPU Device Environment -- C++ ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of VGPU environment classes. +// +//===----------------------------------------------------------------------===// +// +#include + +#include "ThreadEnvironment.h" +#include "ThreadEnvironmentImpl.h" +#include +#include + +std::mutex AtomicIncLock; + +uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) { + std::lock_guard G(AtomicIncLock); + uint32_t V = *Address; + if (V >= Val) + *Address = 0; + else + *Address += 1; + return V; +} + +void VGPUImpl::initLock(uint32_t *Lock) { Lock = (uint32_t *)new std::mutex; } + +void VGPUImpl::destroyLock(uint32_t *Lock) { + std::mutex *Mtx = (std::mutex *)Lock; + delete Mtx; +} + +void VGPUImpl::setLock(uint32_t *Lock) { ((std::mutex *)Lock)->lock(); } + +void VGPUImpl::unsetLock(uint32_t *Lock) { ((std::mutex *)Lock)->unlock(); } + +bool VGPUImpl::testLock(uint32_t *Lock) { + return ((std::mutex *)Lock)->try_lock(); +} + +extern thread_local ThreadEnvironmentTy *ThreadEnvironment; + +ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; } + +ThreadEnvironmentTy::ThreadEnvironmentTy(WarpEnvironmentTy *WE, + CTAEnvironmentTy *CTAE) + : Impl(new VGPUImpl::ThreadEnvironmentTy(WE, CTAE)) {} + +ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; } + +void ThreadEnvironmentTy::fenceTeam(int Ordering) { Impl->fenceTeam(Ordering); } + +void ThreadEnvironmentTy::syncWarp(int Ordering) { Impl->syncWarp(Ordering); } + 
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const { + return Impl->getThreadIdInWarp(); +} + +unsigned ThreadEnvironmentTy::getThreadIdInBlock() const { + return Impl->getThreadIdInBlock(); +} + +unsigned ThreadEnvironmentTy::getGlobalThreadId() const { + return Impl->getGlobalThreadId(); +} + +unsigned ThreadEnvironmentTy::getBlockSize() const { + return Impl->getBlockSize(); +} + +unsigned ThreadEnvironmentTy::getKernelSize() const { + return Impl->getKernelSize(); +} + +unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); } + +unsigned ThreadEnvironmentTy::getNumberOfBlocks() const { + return Impl->getNumberOfBlocks(); +} + +LaneMaskTy ThreadEnvironmentTy::getActiveMask() const { + return Impl->getActiveMask(); +} + +int32_t ThreadEnvironmentTy::shuffle(uint64_t Mask, int32_t Var, + uint64_t SrcLane) { + return Impl->shuffle(Mask, Var, SrcLane); +} + +int32_t ThreadEnvironmentTy::shuffleDown(uint64_t Mask, int32_t Var, + uint32_t Delta) { + return Impl->shuffleDown(Mask, Var, Delta); +} + +void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) { + return Impl->fenceKernel(MemoryOrder); +} + +void ThreadEnvironmentTy::namedBarrier(bool Generic) { + Impl->namedBarrier(Generic); +} + +void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) { + Impl->setBlockEnv(TBE); +} + +void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); } + +unsigned ThreadEnvironmentTy::getWarpSize() const { + return Impl->getWarpSize(); +} diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h @@ -0,0 +1,137 @@ +//===---- ThreadEnvironmentImpl.h - Virtual GPU thread environment - C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H +#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H + +#include "ThreadEnvironment.h" +#include +#include +#include +#include +#include +#include + +using BarrierTy = std::barrier>; + +class WarpEnvironmentTy { + static unsigned Idx; + + const unsigned ID; + + std::vector ShuffleBuffer; + + BarrierTy Barrier; + BarrierTy ShuffleBarrier; + BarrierTy ShuffleDownBarrier; + +public: + static void configure(unsigned NumThreadsInWarp); + + static unsigned ThreadsPerWarp; + + WarpEnvironmentTy(); + + unsigned getWarpId() const; + int getNumThreads() const; + + void sync(int Ordering); + void writeShuffleBuffer(int32_t Var, unsigned LaneId); + + int32_t getShuffleBuffer(unsigned LaneId); + + void waitShuffleBarrier(); + void waitShuffleDownBarrier(); +}; + +class CTAEnvironmentTy { + static unsigned Idx; + +public: + unsigned ID; + static unsigned NumThreads; + static unsigned NumCTAs; + + BarrierTy Barrier; + BarrierTy SyncThreads; + BarrierTy NamedBarrier; + + static void configure(unsigned TotalNumThreads, unsigned NumBlocksInCTA); + + CTAEnvironmentTy(); + + unsigned getId() const; + unsigned getNumThreads() const; + + unsigned getNumBlocks() const; + + void fence(int Ordering); + void syncThreads(); + void namedBarrier(); +}; + +class ThreadBlockEnvironmentTy { + unsigned ID; + unsigned NumBlocks; + +public: + ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks); + + unsigned getId() const; + unsigned getNumBlocks() const; +}; + +namespace VGPUImpl { +class ThreadEnvironmentTy { + static unsigned Idx; + unsigned ThreadIdInWarp; + unsigned ThreadIdInBlock; + unsigned GlobalThreadIdx; + + WarpEnvironmentTy *WarpEnvironment; + ThreadBlockEnvironmentTy *ThreadBlockEnvironment; + CTAEnvironmentTy *CTAEnvironment; + +public: + 
ThreadEnvironmentTy(WarpEnvironmentTy *WE, CTAEnvironmentTy *CTAE); + + void setBlockEnv(ThreadBlockEnvironmentTy *TBE); + + void resetBlockEnv(); + + unsigned getThreadIdInWarp() const; + unsigned getThreadIdInBlock() const; + unsigned getGlobalThreadId() const; + + unsigned getBlockSize() const; + + unsigned getBlockId() const; + + unsigned getNumberOfBlocks() const; + unsigned getKernelSize() const; + + // FIXME: This is wrong + LaneMaskTy getActiveMask() const; + + void fenceTeam(int Ordering); + void syncWarp(int Ordering); + + int32_t shuffle(uint64_t Mask, int32_t Var, uint64_t SrcLane); + + int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta); + + void namedBarrier(bool Generic); + + void fenceKernel(int32_t MemoryOrder); + + unsigned getWarpSize() const; +}; + +} // namespace VGPUImpl + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H diff --git a/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp @@ -0,0 +1,171 @@ +//===-- ThreadEnvironmentImpl.cpp - Virtual GPU thread environment - C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "ThreadEnvironmentImpl.h" +#include +#include +#include +#include +#include +#include + +void WarpEnvironmentTy::configure(unsigned NumThreads) { + ThreadsPerWarp = NumThreads; +} + +WarpEnvironmentTy::WarpEnvironmentTy() + : ID(Idx++), ShuffleBuffer(ThreadsPerWarp), + Barrier(ThreadsPerWarp, []() {}), ShuffleBarrier(ThreadsPerWarp, []() {}), + ShuffleDownBarrier(ThreadsPerWarp, []() {}) {} + +unsigned WarpEnvironmentTy::getWarpId() const { return ID; } + +int WarpEnvironmentTy::getNumThreads() const { return ThreadsPerWarp; } + +void WarpEnvironmentTy::sync(int Ordering) { Barrier.arrive_and_wait(); } + +void WarpEnvironmentTy::writeShuffleBuffer(int32_t Var, unsigned LaneId) { + ShuffleBuffer[LaneId] = Var; +} + +int32_t WarpEnvironmentTy::getShuffleBuffer(unsigned LaneId) { + return ShuffleBuffer[LaneId]; +} + +void WarpEnvironmentTy::waitShuffleBarrier() { + ShuffleBarrier.arrive_and_wait(); +} + +void WarpEnvironmentTy::waitShuffleDownBarrier() { + ShuffleBarrier.arrive_and_wait(); +} + +unsigned WarpEnvironmentTy::Idx = 0; +unsigned WarpEnvironmentTy::ThreadsPerWarp = 0; + +void CTAEnvironmentTy::configure(unsigned TotalNumThreads, unsigned NumBlocks) { + NumThreads = TotalNumThreads / NumBlocks; + NumCTAs = NumBlocks; +} + +CTAEnvironmentTy::CTAEnvironmentTy() + : ID(Idx++), Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}), + NamedBarrier(NumThreads, []() {}) {} + +unsigned CTAEnvironmentTy::getId() const { return ID; } +unsigned CTAEnvironmentTy::getNumThreads() const { return NumThreads; } + +unsigned CTAEnvironmentTy::getNumBlocks() const { return NumCTAs; } + +void CTAEnvironmentTy::fence(int Ordering) { Barrier.arrive_and_wait(); } +void CTAEnvironmentTy::syncThreads() { SyncThreads.arrive_and_wait(); } +void CTAEnvironmentTy::namedBarrier() { 
NamedBarrier.arrive_and_wait(); } + +unsigned CTAEnvironmentTy::Idx = 0; +unsigned CTAEnvironmentTy::NumThreads = 0; +unsigned CTAEnvironmentTy::NumCTAs = 0; + +ThreadBlockEnvironmentTy::ThreadBlockEnvironmentTy(unsigned ID, + unsigned NumBlocks) + : ID(ID), NumBlocks(NumBlocks) {} + +unsigned ThreadBlockEnvironmentTy::getId() const { return ID; } +unsigned ThreadBlockEnvironmentTy::getNumBlocks() const { return NumBlocks; } + +namespace VGPUImpl { +ThreadEnvironmentTy::ThreadEnvironmentTy(WarpEnvironmentTy *WE, + CTAEnvironmentTy *CTAE) + : ThreadIdInWarp(Idx++ % WE->getNumThreads()), + ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadIdInWarp), + GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() + ThreadIdInBlock), + WarpEnvironment(WE), CTAEnvironment(CTAE) {} + +void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) { + ThreadBlockEnvironment = TBE; +} + +void ThreadEnvironmentTy::resetBlockEnv() { + delete ThreadBlockEnvironment; + ThreadBlockEnvironment = nullptr; +} + +unsigned ThreadEnvironmentTy::getThreadIdInWarp() const { + return ThreadIdInWarp; +} +unsigned ThreadEnvironmentTy::getThreadIdInBlock() const { + return ThreadIdInBlock; +} +unsigned ThreadEnvironmentTy::getGlobalThreadId() const { + return GlobalThreadIdx; +} + +unsigned ThreadEnvironmentTy::getBlockSize() const { + return CTAEnvironment->getNumThreads(); +} + +unsigned ThreadEnvironmentTy::getBlockId() const { + return ThreadBlockEnvironment->getId(); +} + +unsigned ThreadEnvironmentTy::getNumberOfBlocks() const { + return ThreadBlockEnvironment->getNumBlocks(); +} +unsigned ThreadEnvironmentTy::getKernelSize() const { + return getBlockSize() * getNumberOfBlocks(); +} + +// FIXME: This is wrong +LaneMaskTy ThreadEnvironmentTy::getActiveMask() const { return ~0U; } + +void ThreadEnvironmentTy::fenceTeam(int Ordering) { + CTAEnvironment->fence(Ordering); +} +void ThreadEnvironmentTy::syncWarp(int Ordering) { + WarpEnvironment->sync(Ordering); +} + +int32_t 
ThreadEnvironmentTy::shuffle(uint64_t Mask, int32_t Var, + uint64_t SrcLane) { + WarpEnvironment->waitShuffleBarrier(); + WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp); + WarpEnvironment->waitShuffleBarrier(); + Var = WarpEnvironment->getShuffleBuffer(ThreadIdInWarp); + return Var; +} + +int32_t ThreadEnvironmentTy::shuffleDown(uint64_t Mask, int32_t Var, + uint32_t Delta) { + WarpEnvironment->waitShuffleDownBarrier(); + WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp); + WarpEnvironment->waitShuffleDownBarrier(); + Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) % + getWarpSize()); + return Var; +} + +void ThreadEnvironmentTy::namedBarrier(bool Generic) { + if (Generic) { + CTAEnvironment->namedBarrier(); + } else { + CTAEnvironment->syncThreads(); + } +} + +void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) { + std::atomic_thread_fence(static_cast(MemoryOrder)); +} + +unsigned ThreadEnvironmentTy::getWarpSize() const { + return WarpEnvironment->getNumThreads(); +} + +unsigned ThreadEnvironmentTy::Idx = 0; + +} // namespace VGPUImpl diff --git a/openmp/libomptarget/plugins/vgpu/src/rtl.cpp b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/vgpu/src/rtl.cpp @@ -0,0 +1,615 @@ +//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for virtual (x86) GPU +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "ThreadEnvironment.h" +#include "ThreadEnvironmentImpl.h" +#include "omptarget.h" +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic ELF - 64bit +#endif +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#include "elf_common.h" + +#define OFFLOADSECTIONNAME "omp_offloading_entries" + +#define DEBUG false + +struct FFICallTy { + ffi_cif CIF; + std::vector ArgsTypes; + std::vector Args; + std::vector Ptrs; + void (*Entry)(void); + + FFICallTy(int32_t ArgNum, void **TgtArgs, ptrdiff_t *TgtOffsets, + void *TgtEntryPtr) + : ArgsTypes(ArgNum, &ffi_type_pointer), Args(ArgNum), Ptrs(ArgNum) { + for (int32_t i = 0; i < ArgNum; ++i) { + Ptrs[i] = (void *)((intptr_t)TgtArgs[i] + TgtOffsets[i]); + Args[i] = &Ptrs[i]; + } + + ffi_status status = ffi_prep_cif(&CIF, FFI_DEFAULT_ABI, ArgNum, + &ffi_type_void, &ArgsTypes[0]); + + assert(status == FFI_OK && "Unable to prepare target launch!"); + + *((void **)&Entry) = TgtEntryPtr; + } +}; + +/// Array of Dynamic libraries loaded for this target. +struct DynLibTy { + char *FileName; + void *Handle; +}; + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; +}; + +thread_local ThreadEnvironmentTy *ThreadEnvironment; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector> FuncGblEntries; + +public: + std::list DynLibs; + + // Record entry point associated with device. 
+ void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, + __tgt_offload_entry *end) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device. + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy() : FuncGblEntries(1) {} + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo; + +std::vector CTAEnvironments; +std::vector WarpEnvironments; + +struct VGPUTy { + struct KernelTy { + FFICallTy *Call; + int NumTeams; + + KernelTy(FFICallTy *Call, int NumTeams) : Call(Call), NumTeams(NumTeams) {} + }; + + struct VGPUStreamTy { + std::queue Kernels; + std::mutex Mtx; + + void emplace(FFICallTy *Call, int NumTeams) { + std::lock_guard Guard(Mtx); + Kernels.emplace(Call, NumTeams); + } + + KernelTy front() { + std::lock_guard Guard(Mtx); + return Kernels.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Kernels.pop(); + } + + bool empty() { + std::lock_guard Guard(Mtx); + return Kernels.empty(); + } + }; + + struct AsyncInfoQueueTy { + 
std::deque<__tgt_async_info *> Streams; + std::mutex Mtx; + + bool empty() { + std::lock_guard Guard(Mtx); + return Streams.empty(); + } + + __tgt_async_info *front() { + std::lock_guard Guard(Mtx); + return Streams.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Streams.pop_front(); + } + + void emplace(__tgt_async_info *AsyncInfo) { + std::lock_guard Guard(Mtx); + Streams.emplace_back(AsyncInfo); + } + } ExecutionQueue; + + VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) { + assert(AsyncInfo != nullptr && "async_info ptr was null"); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = new VGPUStreamTy(); + + return reinterpret_cast(AsyncInfo->Queue); + } + + std::atomic Running; + std::vector Threads; + int WarpsPerCTA = -1; + int NumCTAs = -1; + int NumThreads = -1; + + std::unique_ptr>> Barrier; + std::condition_variable WorkAvailable; + std::mutex WorkDoneMtx; + std::condition_variable WorkDone; + + void configureArchitecture() { + int ThreadsPerWarp = -1; + + if (const char *Env = std::getenv("VGPU_NUM_THREADS")) + NumThreads = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP")) + ThreadsPerWarp = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA")) + WarpsPerCTA = std::stoi(Env); + + if (NumThreads == -1) + NumThreads = std::thread::hardware_concurrency(); + if (ThreadsPerWarp == -1) + ThreadsPerWarp = NumThreads; + if (WarpsPerCTA == -1) + WarpsPerCTA = 1; + + NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA); + + assert(NumThreads % ThreadsPerWarp == 0 && NumThreads % WarpsPerCTA == 0 && + "Invalid VGPU Config"); + + DP("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n", NumThreads, + ThreadsPerWarp, WarpsPerCTA); + + CTAEnvironmentTy::configure(NumThreads, NumCTAs); + WarpEnvironmentTy::configure(ThreadsPerWarp); + } + + VGPUTy() : Running(true) { + configureArchitecture(); + + Barrier = std::make_unique(NumThreads, []() {}); + Threads.reserve(NumThreads); + + auto GlobalThreadIdx = 0; 
+ for (auto CTAIdx = 0; CTAIdx < CTAEnvironmentTy::NumCTAs; CTAIdx++) { + auto *CTAEnv = new CTAEnvironmentTy(); + for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) { + auto *WarpEnv = new WarpEnvironmentTy(); + for (auto ThreadIdx = 0; ThreadIdx < WarpEnvironmentTy::ThreadsPerWarp; + ThreadIdx++) { + Threads.emplace_back([this, GlobalThreadIdx, CTAEnv, WarpEnv]() { + ThreadEnvironment = new ThreadEnvironmentTy(WarpEnv, CTAEnv); + while (Running) { + { + std::unique_lock UniqueLock(ExecutionQueue.Mtx); + + WorkAvailable.wait(UniqueLock, [&]() { + if (!Running) + return true; + + bool IsEmpty = ExecutionQueue.Streams.empty(); + + return !IsEmpty; + }); + } + + if (ExecutionQueue.empty()) + continue; + + while (!ExecutionQueue.empty()) { + auto *Stream = getStream(ExecutionQueue.front()); + while (!Stream->empty()) { + auto [Call, NumTeams] = Stream->front(); + + runKernel(CTAEnv, Call, NumTeams); + + if (GlobalThreadIdx == 0) { + Stream->pop(); + delete Call; + } + + Barrier->arrive_and_wait(); + } + if (GlobalThreadIdx == 0) { + ExecutionQueue.pop(); + WorkDone.notify_all(); + } + Barrier->arrive_and_wait(); + } + } + delete ThreadEnvironment; + }); + GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads; + } + WarpEnvironments.push_back(WarpEnv); + } + CTAEnvironments.push_back(CTAEnv); + } + } + + void runKernel(CTAEnvironmentTy *CTAEnv, FFICallTy *Call, int NumTeams) { + unsigned TeamIdx = 0; + while (TeamIdx < NumTeams) { + if (CTAEnv->getId() < NumTeams) { + ThreadEnvironment->setBlockEnv( + new ThreadBlockEnvironmentTy(TeamIdx + CTAEnv->getId(), NumTeams)); + ffi_call(&Call->CIF, Call->Entry, NULL, &(Call->Args)[0]); + ThreadEnvironment->resetBlockEnv(); + } + Barrier->arrive_and_wait(); + TeamIdx += NumCTAs; + } + } + + ~VGPUTy() { + awaitAll(); + + Running = false; + WorkAvailable.notify_all(); + + for (auto &Thread : Threads) { + if (Thread.joinable()) + Thread.join(); + } + + for (auto *CTAEnv : CTAEnvironments) + delete CTAEnv; + + for (auto 
*WarpEnv : WarpEnvironments) + delete WarpEnv; + } + + void await(__tgt_async_info *AsyncInfo) { + std::unique_lock UniqueLock(getStream(AsyncInfo)->Mtx); + WorkDone.wait(UniqueLock, + [&]() { return getStream(AsyncInfo)->Kernels.empty(); }); + } + + void awaitAll() { + while (!ExecutionQueue.empty()) { + await(ExecutionQueue.front()); + } + } + + void scheduleAsync(__tgt_async_info *AsyncInfo, FFICallTy *Call, + int NumTeams) { + if (NumTeams == 0) + NumTeams = NumCTAs; + auto *Stream = getStream(AsyncInfo); + Stream->emplace(Call, NumTeams); + ExecutionQueue.emplace(AsyncInfo); + WorkAvailable.notify_all(); + } +}; + +VGPUTy VGPU; + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't have a valid ELF ID we can just fail. +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return 1; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < 1 && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? 
+ if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid + // the dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. 
+ char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. + __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +// Sample implementation of explicit memory allocator. For this plugin all +// kinds are equivalent to each other. 
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr, + int32_t kind) { + void *ptr = NULL; + + switch (kind) { + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEFAULT: + ptr = malloc(size); + break; + default: + REPORT("Invalid target data allocation kind"); + } + + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + VGPU.await(async_info); + delete (VGPUTy::VGPUStreamTy *)async_info->Queue; + async_info->Queue = nullptr; + return 0; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount) { + __tgt_async_info AsyncInfo; + int rc = __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, &AsyncInfo); + + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info) { + DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); + + auto Call = new FFICallTy(arg_num, tgt_args, tgt_offsets, tgt_entry_ptr); + + 
VGPU.scheduleAsync(async_info, std::move(Call), team_num); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, 1, 1, 0); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { + return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr, + tgt_args, tgt_offsets, arg_num, + 1, 1, 0, async_info); +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -21,17 +21,22 @@ #include #include -// List of all plugins that can support offloading. -static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so", - /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", - /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", - /* Remote target */ "libomptarget.rtl.rpc.so", +struct PluginInfoTy { + std::string Name; + bool IsHost; }; +// List of all plugins that can support offloading. 
+static const PluginInfoTy Plugins[] = { + /* PowerPC target */ {"libomptarget.rtl.ppc64.so", true}, + /* x86_64 target */ {"libomptarget.rtl.x86_64.so", true}, + /* CUDA target */ {"libomptarget.rtl.cuda.so", false}, + /* AArch64 target */ {"libomptarget.rtl.aarch64.so", true}, + /* SX-Aurora VE target */ {"libomptarget.rtl.ve.so", false}, + /* AMDGPU target */ {"libomptarget.rtl.amdgpu.so", false}, + /* Remote target */ {"libomptarget.rtl.rpc.so", false}, + /* Virtual GPU target */ {"libomptarget.rtl.vgpu.so", false}}; + PluginManager *PM; #if OMPTARGET_PROFILE_ENABLED @@ -86,21 +91,37 @@ return; } + // TODO: add ability to inspect image and decide automatically + bool UseVGPU = false; + if (auto *EnvFlag = std::getenv("LIBOMPTARGET_USE_VGPU")) + UseVGPU = true; + DP("Loading RTLs...\n"); // Attempt to open all the plugins and, if they exist, check if the interface // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - void *dynlib_handle = dlopen(Name, RTLD_NOW); + for (auto &[Name, IsHost] : Plugins) { + DP("Loading library '%s'...\n", Name.c_str()); + + int Flags = RTLD_NOW; + + if (Name.compare("libomptarget.rtl.vgpu.so") == 0) + Flags |= RTLD_GLOBAL; + + if (UseVGPU && IsHost) { + DP("Skipping library '%s': VGPU was requested.\n", Name.c_str()); + continue; + } + + void *dynlib_handle = dlopen(Name.c_str(), Flags); if (!dynlib_handle) { // Library does not exist or cannot be found. 
- DP("Unable to load library '%s': %s!\n", Name, dlerror()); + DP("Unable to load library '%s': %s!\n", Name.c_str(), dlerror()); continue; } - DP("Successfully loaded library '%s'!\n", Name); + DP("Successfully loaded library '%s'!\n", Name.c_str()); AllRTLs.emplace_back(); diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt --- a/openmp/libomptarget/test/CMakeLists.txt +++ b/openmp/libomptarget/test/CMakeLists.txt @@ -18,6 +18,9 @@ string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}") foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS) + IF ("${CURRENT_TARGET}" MATCHES "-vgpu") + continue() + ENDIF() string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET) add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET} "Running libomptarget tests"