diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -252,7 +252,7 @@
   std::string BitcodeSuffix;
   if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
                          options::OPT_fno_openmp_target_new_runtime, false))
-    BitcodeSuffix = "new-amdgcn-" + GPUArch;
+    BitcodeSuffix = "new-amdgpu-" + GPUArch;
   else
     BitcodeSuffix = "amdgcn-" + GPUArch;
 
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -48,10 +48,10 @@
 uint32_t read(uint32_t *Addr, int Ordering);
 
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, int Ordering);
 
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+void store(uint64_t *Addr, uint64_t V, int Ordering);
 
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
 uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -20,14 +20,14 @@
 
 #pragma omp declare target
 
-extern uint32_t __omp_rtl_debug_kind;
+// extern uint32_t __omp_rtl_debug_kind;
 
 // TOOD: We want to change the name as soon as the old runtime is gone.
 DeviceEnvironmentTy CONSTANT(omptarget_device_environment)
     __attribute__((used));
 
 uint32_t config::getDebugKind() {
-  return __omp_rtl_debug_kind & omptarget_device_environment.DebugKind;
+  return /*__omp_rtl_debug_kind &*/ omptarget_device_environment.DebugKind;
 }
 
 uint32_t config::getNumDevices() {
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -29,7 +29,7 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
 uint32_t getGridDim(uint32_t n, uint16_t d) {
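
Note on the Mapping.cpp hunk above: getAMDGPUGridValues takes the wavefront
size as a template argument, so each build of the DeviceRTL bakes the
wave32/wave64 constants in at compile time. Below is a minimal sketch of that
selection pattern; the GridValues struct and its fields are illustrative
stand-ins for llvm::omp::GV, not the LLVM definitions.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for llvm::omp::GV: launch constants that differ
// between wave32 and wave64 targets.
struct GridValues {
  uint32_t WarpSize;
  uint32_t MaxThreadsPerBlock;
};

constexpr GridValues GridValues32{32, 1024}; // wave32 (e.g. gfx10xx)
constexpr GridValues GridValues64{64, 1024}; // wave64 (e.g. gfx9xx)

// Taking the wavefront size as a template argument makes the selection a
// compile-time constant, so no runtime branch on warp size survives into
// the generated device code.
template <uint32_t WavefrontSize>
constexpr const GridValues &getGridValues() {
  static_assert(WavefrontSize == 32 || WavefrontSize == 64, "unsupported");
  return WavefrontSize == 32 ? GridValues32 : GridValues64;
}

// A device compile would pass the target's __AMDGCN_WAVEFRONT_SIZE here;
// 64 is just for illustration.
constexpr const GridValues &Values = getGridValues<64>();
static_assert(Values.WarpSize == 64, "selected the wave64 table");

int main() { std::printf("warp size: %u\n", Values.WarpSize); }
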
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -32,9 +32,18 @@
 uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
 
 uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  // TODO: amdgpu can load from memory; nvptx probably can too.
   return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
 }
 
+void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+  __atomic_store_n(Address, Val, Ordering);
+}
+
+void atomicStore(uint64_t *Address, uint64_t Val, int Ordering) {
+  __atomic_store_n(Address, Val, Ordering);
+}
+
 uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
@@ -64,11 +73,54 @@
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
+#define FENCE_ATOMIC_CASES()                                                   \
+  CASE(__ATOMIC_ACQUIRE);                                                      \
+  CASE(__ATOMIC_RELEASE);                                                      \
+  CASE(__ATOMIC_ACQ_REL);                                                      \
+  CASE(__ATOMIC_SEQ_CST)
+#define ALL_ATOMIC_CASES()                                                     \
+  FENCE_ATOMIC_CASES();                                                        \
+  CASE(__ATOMIC_RELAXED);                                                      \
+  CASE(__ATOMIC_CONSUME)
+
+#if 0
+// Can't spell the dispatch from runtime ordering like:
+template <int Ordering>
+static uint32_t atomicInc(uint32_t *Address, uint32_t Val) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+// clang/lib/AST/ExprConstant.cpp:14739: bool clang::Expr::EvaluateAsInt(clang::Expr::EvalResult&, const clang::ASTContext&, clang::Expr::SideEffectsKind, bool) const: Assertion `!isValueDependent() && "Expression evaluator can't be called on a dependent expression."' failed.
+
+// Can spell it with raw macros or struct dispatch, or raise ordering to a
+// template parameter on the Synchronization API. Choosing struct dispatch.
+#endif
+
+namespace {
+template <int Ordering> struct atomicOpTy;
+#define CASE(X)                                                                \
+  template <> struct atomicOpTy<X> {                                           \
+    static uint32_t atomicInc(uint32_t *Address, uint32_t Val) {               \
+      return __builtin_amdgcn_atomic_inc32(Address, Val, X, "");               \
+    }                                                                          \
+  };
+ALL_ATOMIC_CASES();
+#undef CASE
+} // namespace
+
 uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
-  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+#define CASE(X)                                                                \
+  case X:                                                                      \
+    return atomicOpTy<X>::atomicInc(Address, Val)
+    FENCE_ATOMIC_CASES();
+#undef CASE
+  }
 }
 
-uint32_t SHARD(namedBarrierTracker);
+uint32_t SHARED(namedBarrierTracker);
 
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
@@ -79,7 +131,7 @@
   uint32_t NumThreads = omp_get_num_threads();
   // assert(NumThreads % 32 == 0);
 
-  uint32_t WarpSize = maping::getWarpSize();
+  uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
 
   fence::team(__ATOMIC_ACQUIRE);
@@ -115,24 +167,73 @@
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::read(&namedBarrierTracker, __ATOMIC_RELAXED);
       } while ((load & 0xffff0000u) == generation);
     }
   }
   fence::team(__ATOMIC_RELEASE);
 }
 
+namespace {
+template <int Ordering> struct atomicFenceTy;
+#define CASE(X)                                                                \
+  template <> struct atomicFenceTy<X> {                                        \
+    static void fenceTeam() { __builtin_amdgcn_fence(X, "workgroup"); }        \
+    static void fenceKernel() { __builtin_amdgcn_fence(X, "agent"); }          \
+    static void fenceSystem() { __builtin_amdgcn_fence(X, ""); }               \
+  };
+FENCE_ATOMIC_CASES();
+#undef CASE
+} // namespace
+
+void fenceTeam(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+#define CASE(X)                                                                \
+  case X:                                                                      \
+    return atomicFenceTy<X>::fenceTeam()
+    FENCE_ATOMIC_CASES();
+#undef CASE
+  }
+}
+
+void fenceKernel(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+#define CASE(X)                                                                \
+  case X:                                                                      \
+    return atomicFenceTy<X>::fenceKernel()
+    FENCE_ATOMIC_CASES();
+#undef CASE
+  }
+}
+
+void fenceSystem(int Ordering) {
+  switch (Ordering) {
+  default:
+    __builtin_unreachable();
+#define CASE(X)                                                                \
+  case X:                                                                      \
+    return atomicFenceTy<X>::fenceSystem()
+    FENCE_ATOMIC_CASES();
+#undef CASE
+  }
+}
+
 void syncWarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
 void syncThreads() { __builtin_amdgcn_s_barrier(); }
 
-void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
-
-void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
-
-void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+// TODO: Don't have wavefront lane locks. Possibly can't have them.
+void unsetLock(omp_lock_t *) { __builtin_trap(); }
+int testLock(omp_lock_t *) { __builtin_trap(); }
+void initLock(omp_lock_t *) { __builtin_trap(); }
+void destroyLock(omp_lock_t *) { __builtin_trap(); }
+void setLock(omp_lock_t *) { __builtin_trap(); }
 
 #pragma omp end declare variant
 ///}
@@ -192,7 +293,7 @@
 
 void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
-void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
@@ -233,6 +334,14 @@
   return impl::atomicRead(Addr, Ordering);
 }
 
+void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+  impl::atomicStore(Addr, V, Ordering);
+}
+
+void atomic::store(uint64_t *Addr, uint64_t V, int Ordering) {
+  impl::atomicStore(Addr, V, Ordering);
+}
+
 uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
   return impl::atomicInc(Addr, V, Ordering);
 }
@@ -300,7 +409,7 @@
 
 void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
 
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
 
 void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
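
The #if 0 note in the Synchronization.cpp hunks records why the dispatch looks
the way it does: the amdgcn builtins require their ordering argument to be a
compile-time constant, and even a function template parameter trips clang's
constant evaluator, so the patch switches on the runtime ordering and maps
each case to a distinct template specialization. A self-contained sketch of
that switch-to-template pattern follows; FenceTy and fence() are illustrative
names, and std::atomic_thread_fence stands in for __builtin_amdgcn_fence
(which additionally takes a sync-scope string).

#include <atomic>
#include <cstdio>

template <int Ordering> struct FenceTy {
  static void run() {
    // The __ATOMIC_* macro values match std::memory_order on GCC/Clang,
    // and Ordering is now a compile-time constant, as a builtin requires.
    std::atomic_thread_fence(static_cast<std::memory_order>(Ordering));
  }
};

// Runtime value in, compile-time constant out: each case names a distinct
// specialization whose template argument is a literal.
void fence(int Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case __ATOMIC_ACQUIRE:
    return FenceTy<__ATOMIC_ACQUIRE>::run();
  case __ATOMIC_RELEASE:
    return FenceTy<__ATOMIC_RELEASE>::run();
  case __ATOMIC_SEQ_CST:
    return FenceTy<__ATOMIC_SEQ_CST>::run();
  }
}

int main() {
  int Ordering = __ATOMIC_RELEASE; // only known at runtime
  fence(Ordering);
  std::puts("fenced");
}

The FENCE_ATOMIC_CASES/ALL_ATOMIC_CASES macros in the patch just stamp out
these case labels and specializations for every ordering.
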
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -35,8 +35,9 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
-  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+  static_assert(sizeof(unsigned long) == 8, "");
+  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
@@ -75,7 +76,7 @@
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
   int Width = mapping::getWarpSize();
-  int Self = mapping::getgetThreadIdInWarp();
+  int Self = mapping::getThreadIdInWarp();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }
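
For reference, the Unpack change keeps the same 64-to-2x32 bit split but
spells the masks with a UL suffix, presumably because the UINT64_C macro is
not available in this compilation mode; the static_assert pins down the
assumption that unsigned long is 64 bits wide on the device. A host-side
sketch of the round trip follows; Pack's body is not shown in the diff, so
the obvious inverse is assumed here.

#include <cassert>
#include <cstdint>

// Mirrors the patched Unpack: split a 64-bit value into low and high words.
static void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFULL);
  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000ULL) >> 32);
}

// Assumed inverse (not shown in the diff): rebuild the 64-bit value.
static uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  return ((uint64_t)HighBits << 32) | (uint64_t)LowBits;
}

int main() {
  uint32_t Lo = 0, Hi = 0;
  Unpack(0x0123456789ABCDEFULL, &Lo, &Hi);
  assert(Lo == 0x89ABCDEFu && Hi == 0x01234567u);
  assert(Pack(Lo, Hi) == 0x0123456789ABCDEFULL); // round trip
  return 0;
}
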