diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3866,7 +3866,8 @@
           TT.getArch() == llvm::Triple::nvptx64 ||
           TT.getArch() == llvm::Triple::amdgcn ||
           TT.getArch() == llvm::Triple::x86 ||
-          TT.getArch() == llvm::Triple::x86_64))
+          TT.getArch() == llvm::Triple::x86_64 ||
+          TT.getArch() == llvm::Triple::spir64))
        Diags.Report(diag::err_drv_invalid_omp_target) << A->getValue(i);
      else if (getArchPtrSize(T) != getArchPtrSize(TT))
        Diags.Report(diag::err_drv_incompatible_omp_arch)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -120,6 +120,18 @@
   128, // GV_Default_WG_Size
 };
 
+/// For Intel GPUs
+static constexpr GV SPIR64GridValues64 = {
+    256,       // GV_Slot_Size
+    64,        // GV_Warp_Size
+    (1 << 16), // GV_Max_Teams
+    440,       // GV_Default_Num_Teams
+    896,       // GV_SimpleBufferSize
+    1024,      // GV_Max_WG_Size
+    256,       // GV_Default_WG_Size
+};
+
 } // namespace omp
 } // namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -1210,6 +1210,7 @@
 __OMP_TRAIT_PROPERTY(device, arch, amdgcn)
 __OMP_TRAIT_PROPERTY(device, arch, nvptx)
 __OMP_TRAIT_PROPERTY(device, arch, nvptx64)
+__OMP_TRAIT_PROPERTY(device, arch, spir64)
 
 __OMP_TRAIT_SET(implementation)
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
@@ -51,6 +51,7 @@
   case Triple::amdgcn:
   case Triple::nvptx:
   case Triple::nvptx64:
+  case Triple::spir64:
     ActiveTraits.set(unsigned(TraitProperty::device_kind_gpu));
     break;
   default:
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -281,6 +281,10 @@
   endif()
 endforeach()
 
+add_custom_target(omptarget.devicertl.spir64)
+# TODO: Adapt names/versions here.
+compileDeviceRTLLibrary(spir64 spir64 spir64-intel-l0 -fopenmp-targets=spir64)
+
 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
 set_target_properties(omptarget.devicertl PROPERTIES LINKER_LANGUAGE CXX)
diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h
--- a/openmp/libomptarget/DeviceRTL/include/Types.h
+++ b/openmp/libomptarget/DeviceRTL/include/Types.h
@@ -140,6 +140,11 @@
 using LaneMaskTy = uint64_t;
 #pragma omp end declare variant
 
+#pragma omp begin declare variant match(device = {arch(spir64)})
+// TODO: Verify this lane-mask width for SPIR-V sub-groups.
+using LaneMaskTy = uint64_t;
+#pragma omp end declare variant
+
 namespace lanes {
 enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
 } // namespace lanes
diff --git a/openmp/libomptarget/DeviceRTL/src/LibC.cpp b/openmp/libomptarget/DeviceRTL/src/LibC.cpp
--- a/openmp/libomptarget/DeviceRTL/src/LibC.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/LibC.cpp
@@ -33,6 +33,15 @@
 } // namespace impl
 #pragma omp end declare variant
 
+/// Intel implementation
+#pragma omp begin declare variant match(device = {arch(spir64)})
+namespace impl {
+int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+  return -1;
+}
+} // namespace impl
+#pragma omp end declare variant
+
 extern "C" {
 
 int memcmp(const void *lhs, const void *rhs, size_t count) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -151,6 +151,65 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+extern size_t __spirv_BuiltInGlobalSize(int);
+extern size_t __spirv_BuiltInLocalInvocationId(int);
+extern size_t __spirv_BuiltInWorkgroupId(int);
+extern size_t __spirv_BuiltInWorkgroupSize(int);
+
+const llvm::omp::GV &getGridValue() { return llvm::omp::SPIR64GridValues64; }
+
+uint32_t getNumHardwareThreadsInBlock() {
+  return __spirv_BuiltInWorkgroupSize(0);
+}
+
+LaneMaskTy activemask() { return -1; }
+
+LaneMaskTy lanemaskLT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() { return __spirv_BuiltInLocalInvocationId(0); }
+
+uint32_t getKernelSize() { return __spirv_BuiltInGlobalSize(0); }
+
+uint32_t getBlockId() { return __spirv_BuiltInWorkgroupId(0); }
+
+uint32_t getNumberOfBlocks() {
+  return __spirv_BuiltInGlobalSize(0) / __spirv_BuiltInWorkgroupSize(0);
+}
+
+uint32_t getWarpId() {
+  return impl::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getNumberOfWarpsInBlock() {
+  return mapping::getBlockSize() / mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
 
 } // namespace impl
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -58,6 +58,18 @@
 
 #pragma omp end declare variant
 
+/// Intel implementation
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+double getWTick() { return ((double)1E-9); }
+
+double getWTime() {
+  // TODO
+  return 0;
+}
+
+#pragma omp end declare variant
+
 } // namespace impl
 } // namespace ompx
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -66,6 +66,23 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementations of malloc/free.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+extern "C" {
+void *malloc(uint64_t Size) {
+  // TODO: Use some preallocated space for dynamic malloc.
+  return nullptr;
+}
+
+void free(void *Ptr) {}
+}
+
+#pragma omp end declare variant
+///}
+
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -351,6 +351,138 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+extern int __spirv_AtomicIAdd(int *, int, int, int);
+extern void __spirv_MemoryBarrier(int, int);
+extern void __spirv_ControlBarrier(int, int, int);
+
+typedef enum {
+  CrossDevice = 0,
+  Device = 1,
+  Workgroup = 2,
+  Subgroup = 3,
+  Invocation = 4
+} Scope_t;
+
+typedef enum {
+  Relaxed = 0x0,
+  Acquire = 0x2,
+  Release = 0x4,
+  AcquireRelease = 0x8,
+  SequentiallyConsistent = 0x10,
+  UniformMemory = 0x40,
+  SubgroupMemory = 0x80,
+  WorkgroupMemory = 0x100,
+  CrossWorkgroupMemory = 0x200,
+  AtomicCounterMemory = 0x400,
+  ImageMemory = 0x800
+} MemorySemantics_t;
+
+uint32_t atomicInc(uint32_t *uA, uint32_t V, atomic::OrderingTy Ordering) {
+  int *A = (int *)uA;
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::relaxed:
+    return __spirv_AtomicIAdd(A, Scope_t::Device, MemorySemantics_t::Relaxed,
+                              V);
+  case atomic::aquire:
+    return __spirv_AtomicIAdd(A, Scope_t::Device, MemorySemantics_t::Acquire,
+                              V);
+  case atomic::release:
+    return __spirv_AtomicIAdd(A, Scope_t::Device, MemorySemantics_t::Release,
+                              V);
+  case atomic::acq_rel:
+    return __spirv_AtomicIAdd(A, Scope_t::Device,
+                              MemorySemantics_t::AcquireRelease, V);
+  case atomic::seq_cst:
+    return __spirv_AtomicIAdd(A, Scope_t::Device,
+                              MemorySemantics_t::SequentiallyConsistent, V);
+  }
+}
+
+void namedBarrierInit() { __builtin_trap(); } // TODO
+
+void namedBarrier() { __builtin_trap(); } // TODO
+
+void fenceTeam(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::aquire:
+    return __spirv_MemoryBarrier(Scope_t::Workgroup,
+                                 MemorySemantics_t::Acquire);
+  case atomic::release:
+    return __spirv_MemoryBarrier(Scope_t::Workgroup,
+                                 MemorySemantics_t::Release);
+  case atomic::acq_rel:
+    return __spirv_MemoryBarrier(Scope_t::Workgroup,
+                                 MemorySemantics_t::AcquireRelease);
+  case atomic::seq_cst:
+    return __spirv_MemoryBarrier(Scope_t::Workgroup,
+                                 MemorySemantics_t::SequentiallyConsistent);
+  }
+}
+void fenceKernel(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::aquire:
+    return __spirv_MemoryBarrier(Scope_t::Invocation,
+                                 MemorySemantics_t::Acquire);
+  case atomic::release:
+    return __spirv_MemoryBarrier(Scope_t::Invocation,
+                                 MemorySemantics_t::Release);
+  case atomic::acq_rel:
+    return __spirv_MemoryBarrier(Scope_t::Invocation,
+                                 MemorySemantics_t::AcquireRelease);
+  case atomic::seq_cst:
+    return __spirv_MemoryBarrier(Scope_t::Invocation,
+                                 MemorySemantics_t::SequentiallyConsistent);
+  }
+}
+void fenceSystem(atomic::OrderingTy Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+  case atomic::aquire:
+    return __spirv_MemoryBarrier(Scope_t::Device, MemorySemantics_t::Acquire);
+  case atomic::release:
+    return __spirv_MemoryBarrier(Scope_t::Device, MemorySemantics_t::Release);
+  case atomic::acq_rel:
+    return __spirv_MemoryBarrier(Scope_t::Device,
+                                 MemorySemantics_t::AcquireRelease);
+  case atomic::seq_cst:
+    return __spirv_MemoryBarrier(Scope_t::Device,
+                                 MemorySemantics_t::SequentiallyConsistent);
+  }
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  __spirv_ControlBarrier(Scope_t::Subgroup, Scope_t::Invocation,
+                         MemorySemantics_t::Acquire);
+}
+
+void syncThreads() {
+  __spirv_ControlBarrier(Scope_t::Workgroup, Scope_t::Invocation,
+                         MemorySemantics_t::Acquire);
+}
+
+void syncThreadsAligned() { syncThreads(); }
+
+void unsetLock(omp_lock_t *) { __builtin_trap(); }
+int testLock(omp_lock_t *) { __builtin_trap(); }
+void initLock(omp_lock_t *) { __builtin_trap(); }
+void destroyLock(omp_lock_t *) { __builtin_trap(); }
+void setLock(omp_lock_t *) { __builtin_trap(); }
+
+#pragma omp end declare variant
+///}
+
 } // namespace impl
 
 void synchronize::init(bool IsSPMD) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -79,6 +79,24 @@
 #pragma omp end declare variant
 ///}
 
+/// Intel implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  static_assert(sizeof(unsigned long) == 8, "");
+  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+///}
+
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);
@@ -103,8 +121,9 @@
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }
 
-bool isSharedMemPtr(const void * Ptr) {
-  return __builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void *)Ptr);
+bool isSharedMemPtr(const void *Ptr) {
+  return __builtin_amdgcn_is_shared(
+      (const __attribute__((address_space(0))) void *)Ptr);
 }
 
 #pragma omp end declare variant
 ///}
@@ -126,6 +145,27 @@
 
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
+#pragma omp end declare variant
+///}
+
+/// Intel implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(spir64)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { // TODO
+  __builtin_trap();
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+                    int32_t Width) { // TODO
+  __builtin_trap();
+}
+
+bool isSharedMemPtr(const void *Ptr) { // TODO
+  __builtin_trap();
+}
+
 #pragma omp end declare variant
 ///}
 
 } // namespace impl
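
Note (not part of the patch): a minimal smoke-test sketch for the new target.
It exercises exactly the two pieces this patch enables, the
-fopenmp-targets=spir64 driver flag and the arch(spir64) context-selector
trait; the file name, the expected output, and the availability of a working
spir64 device plugin at runtime are assumptions.

// smoke_spir64.cpp
// Assumed build line: clang++ -fopenmp -fopenmp-targets=spir64 smoke_spir64.cpp
#include <cstdio>

#pragma omp declare target
// Generic fallback, used on the host and on non-spir64 devices.
static int archID() { return 0; }

// Variant selected when the arch(spir64) trait added in OMPKinds.def above
// matches the device compilation context.
#pragma omp begin declare variant match(device = {arch(spir64)})
static int archID() { return 1; }
#pragma omp end declare variant
#pragma omp end declare target

int main() {
  int ID = -1;
  // Offload a trivial region and report which variant was linked in.
#pragma omp target map(from : ID)
  ID = archID();
  std::printf("archID = %d\n", ID); // expect 1 when offloaded to spir64
  return ID == 1 ? 0 : 1;
}

The same begin/end declare variant idiom is what the DeviceRTL changes above
rely on, so this doubles as a check that variant resolution works for the
new trait.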