diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -120,6 +120,18 @@
   128, // GV_Default_WG_Size
 };
 
+/// For Intel GPUs
+static constexpr GV SPIR64GridValues64 = {
+    256,       // GV_Slot_Size
+    64,        // GV_Warp_Size
+    (1 << 16), // GV_Max_Teams
+    440,       // GV_Default_Num_Teams
+    896,       // GV_SimpleBufferSize
+    1024,      // GV_Max_WG_Size
+    256,       // GV_Default_WG_Size
+};
+
+
 } // namespace omp
 } // namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -1210,6 +1210,7 @@
 __OMP_TRAIT_PROPERTY(device, arch, amdgcn)
 __OMP_TRAIT_PROPERTY(device, arch, nvptx)
 __OMP_TRAIT_PROPERTY(device, arch, nvptx64)
+__OMP_TRAIT_PROPERTY(device, arch, spir64)
 
 __OMP_TRAIT_SET(implementation)
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
@@ -51,6 +51,7 @@
   case Triple::amdgcn:
   case Triple::nvptx:
   case Triple::nvptx64:
+  case Triple::spir64:
     ActiveTraits.set(unsigned(TraitProperty::device_kind_gpu));
     break;
   default:
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -278,6 +278,10 @@
   compileDeviceRTLLibrary(${mcpu} amdgpu amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa)
 endforeach()
 
+add_custom_target(omptarget.devicertl.spir64)
+# TODO Adapt names/versions here
+compileDeviceRTLLibrary(spir64 spir64 spir64-intel-l0 -fopenmp-targets=spir64)
+
 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
 set_target_properties(omptarget.devicertl PROPERTIES LINKER_LANGUAGE CXX)
diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h
--- a/openmp/libomptarget/DeviceRTL/include/Types.h
+++ b/openmp/libomptarget/DeviceRTL/include/Types.h
@@ -140,6 +140,11 @@
 using LaneMaskTy = uint64_t;
 #pragma omp end declare variant
 
+#pragma omp begin declare variant match(device = {arch(spir64)})
+// TODO verify this
+using LaneMaskTy = uint64_t;
+#pragma omp end declare variant
+
 namespace lanes {
 enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
 } // namespace lanes
diff --git a/openmp/libomptarget/DeviceRTL/src/LibC.cpp b/openmp/libomptarget/DeviceRTL/src/LibC.cpp
--- a/openmp/libomptarget/DeviceRTL/src/LibC.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/LibC.cpp
@@ -33,6 +33,15 @@
 } // namespace impl
 #pragma omp end declare variant
 
+/// Intel implementation
+#pragma omp begin declare variant match(device = {arch(spir64)})
+namespace impl {
+int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+  return -1;
+}
+} // namespace impl
+#pragma omp end declare variant
+
 extern "C" {
 
 int memcmp(const void *lhs, const void *rhs, size_t count) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -151,6 +151,60 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+const llvm::omp::GV &getGridValue() { return llvm::omp::SPIR64GridValues64; }
+
+uint32_t getNumHardwareThreadsInBlock() {
+  return __spirv_BuiltInWorkgroupSize(0);
+}
+
+LaneMaskTy activemask() { return -1; }
+
+LaneMaskTy lanemaskLT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() { return __spirv_BuiltInLocalInvocationId(0); }
+
+uint32_t getKernelSize() { return __spirv_BuiltInGlobalSize(0); }
+
+uint32_t getBlockId() { return __spirv_BuiltInWorkgroupId(0); }
+
+uint32_t getNumberOfBlocks() {
+  return __spirv_BuiltInGlobalSize(0) / __spirv_BuiltInWorkgroupSize(0);
+}
+
+uint32_t getWarpId() {
+  return impl::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getNumberOfWarpsInBlock() {
+  return mapping::getBlockSize() / mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
 
 } // namespace impl
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -58,6 +58,18 @@
 
 #pragma omp end declare variant
 
+/// Intel implementation
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+double getWTick() { return ((double)1E-9); }
+
+double getWTime() {
+  // TODO
+  return 0;
+}
+
+#pragma omp end declare variant
+
 } // namespace impl
 } // namespace ompx
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -66,6 +66,23 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementation of malloc/free.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+extern "C" {
+void *malloc(uint64_t Size) {
+  // TODO: Use some preallocated space for dynamic malloc.
+  return nullptr;
+}
+
+void free(void *Ptr) {}
+}
+
+#pragma omp end declare variant
+///}
+
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -351,6 +351,64 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
+  // TODO
+  return 0;
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // TODO
+}
+
+void namedBarrier() {
+  // TODO
+}
+
+// sema checking of amdgcn_fence is aggressive. Intention is to patch clang
+// so that it is usable within a template environment and so that a runtime
+// value of the memory order is expanded to this switch within clang/llvm.
+void fenceTeam(atomic::OrderingTy Ordering) {
+  // TODO
+}
+void fenceKernel(atomic::OrderingTy Ordering) {
+  // TODO
+}
+void fenceSystem(atomic::OrderingTy Ordering) {
+  // TODO
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // TODO
+}
+
+void syncThreads() { // TODO
+}
+void syncThreadsAligned() { // TODO
+}
+
+// TODO: Don't have wavefront lane locks. Possibly can't have them.
+void unsetLock(omp_lock_t *) { // TODO
+}
+int testLock(omp_lock_t *) { // TODO
+  return 0;
+}
+void initLock(omp_lock_t *) { // TODO
+}
+void destroyLock(omp_lock_t *) { // TODO
+}
+void setLock(omp_lock_t *) { // TODO
+}
+
+#pragma omp end declare variant
+///}
+
 } // namespace impl
 
 void synchronize::init(bool IsSPMD) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -79,6 +79,24 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  static_assert(sizeof(unsigned long) == 8, "");
+  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+///}
+
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);
@@ -126,6 +144,28 @@
 
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
+#pragma omp end declare variant
+///}
+
+/// Intel Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  // TODO
+  return 0;
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  // TODO
+  return 0;
+}
+
+bool isSharedMemPtr(const void *Ptr) { // TODO
+  return false;
+}
+
 #pragma omp end declare variant
 ///}
 } // namespace impl
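
A possible starting point for the fence and atomicInc TODOs in the spir64 variant block of Synchronization.cpp, sketched with generic Clang atomic builtins rather than target-specific intrinsics. This is not part of the patch: it assumes atomic::OrderingTy values are interchangeable with the __ATOMIC_* constants (as they are for the other DeviceRTL targets) and that a plain thread fence lowers to a sufficiently strong fence on the Intel Level Zero stack; neither assumption has been verified against the spir64 toolchain.

// Sketch only, intended for the spir64 variant block in Synchronization.cpp.
// CUDA-style atomicInc: wrap *A back to 0 once it reaches V, return old value.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
  uint32_t Old = __atomic_load_n(A, __ATOMIC_RELAXED);
  uint32_t New;
  do {
    New = (Old >= V) ? 0 : (Old + 1);
  } while (!__atomic_compare_exchange_n(A, &Old, New, /*weak=*/true,
                                        (int)Ordering, __ATOMIC_RELAXED));
  return Old;
}

// Scope-agnostic fences: assumes (unverified) that __atomic_thread_fence is at
// least device-scope when lowered for spir64, which makes it conservative but
// stronger than necessary for team/kernel scope.
void fenceTeam(atomic::OrderingTy Ordering) {
  __atomic_thread_fence((int)Ordering);
}
void fenceKernel(atomic::OrderingTy Ordering) {
  __atomic_thread_fence((int)Ordering);
}
void fenceSystem(atomic::OrderingTy Ordering) {
  __atomic_thread_fence((int)Ordering);
}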