Changeset View
Changeset View
Standalone View
Standalone View
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | |||||
} | } | ||||
///} | ///} | ||||
/// AMDGCN Implementation | /// AMDGCN Implementation | ||||
/// | /// | ||||
///{ | ///{ | ||||
#pragma omp begin declare variant match(device = {arch(amdgcn)}) | #pragma omp begin declare variant match(device = {arch(amdgcn)}) | ||||
uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) { | uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) { | ||||
return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, ""); | // builtin_amdgcn_atomic_inc32 should expand to this switch when | ||||
// passed a runtime value, but does not do so yet. Workaround here. | |||||
switch (Ordering) { | |||||
default: | |||||
__builtin_unreachable(); | |||||
case __ATOMIC_RELAXED: | |||||
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, ""); | |||||
case __ATOMIC_ACQUIRE: | |||||
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, ""); | |||||
case __ATOMIC_RELEASE: | |||||
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, ""); | |||||
case __ATOMIC_ACQ_REL: | |||||
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, ""); | |||||
case __ATOMIC_SEQ_CST: | |||||
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, ""); | |||||
} | |||||
JonChesterfield: This is not good; we need to revise the sema checking on these intrinsics and add some lowering in clang/llvm that builds the switch. It is written longhand here to get things running. | |||||
} | } | ||||
uint32_t SHARED(namedBarrierTracker); | uint32_t SHARED(namedBarrierTracker); | ||||
void namedBarrierInit() { | void namedBarrierInit() { | ||||
// Don't have global ctors, and shared memory is not zero init | // Don't have global ctors, and shared memory is not zero init | ||||
atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE); | atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE); | ||||
} | } | ||||
Show All 40 Lines | if ((load & 0x0000ffffu) == (NumWaves - 1)) { | ||||
__builtin_amdgcn_s_sleep(0); | __builtin_amdgcn_s_sleep(0); | ||||
load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED); | load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED); | ||||
} while ((load & 0xffff0000u) == generation); | } while ((load & 0xffff0000u) == generation); | ||||
} | } | ||||
} | } | ||||
fence::team(__ATOMIC_RELEASE); | fence::team(__ATOMIC_RELEASE); | ||||
} | } | ||||
// Sema checking of amdgcn_fence is aggressive. The intention is to patch clang | |||||
// so that the builtin is usable within a template environment and so that a | |||||
// runtime value of the memory order is expanded to this switch within clang/llvm. | |||||
void fenceTeam(int Ordering) { | |||||
switch (Ordering) { | |||||
default: | |||||
__builtin_unreachable(); | |||||
case __ATOMIC_ACQUIRE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); | |||||
case __ATOMIC_RELEASE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); | |||||
case __ATOMIC_ACQ_REL: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); | |||||
case __ATOMIC_SEQ_CST: | |||||
return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); | |||||
} | |||||
} | |||||
void fenceKernel(int Ordering) { | |||||
switch (Ordering) { | |||||
default: | |||||
__builtin_unreachable(); | |||||
case __ATOMIC_ACQUIRE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); | |||||
case __ATOMIC_RELEASE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); | |||||
case __ATOMIC_ACQ_REL: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); | |||||
case __ATOMIC_SEQ_CST: | |||||
return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); | |||||
} | |||||
} | |||||
void fenceSystem(int Ordering) { | |||||
switch (Ordering) { | |||||
default: | |||||
__builtin_unreachable(); | |||||
case __ATOMIC_ACQUIRE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ""); | |||||
case __ATOMIC_RELEASE: | |||||
return __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); | |||||
case __ATOMIC_ACQ_REL: | |||||
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); | |||||
case __ATOMIC_SEQ_CST: | |||||
return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); | |||||
} | |||||
} | |||||
void syncWarp(__kmpc_impl_lanemask_t) { | void syncWarp(__kmpc_impl_lanemask_t) { | ||||
// AMDGCN doesn't need to sync threads in a warp | // AMDGCN doesn't need to sync threads in a warp | ||||
} | } | ||||
void syncThreads() { __builtin_amdgcn_s_barrier(); } | void syncThreads() { __builtin_amdgcn_s_barrier(); } | ||||
void syncThreadsAligned() { syncThreads(); } | void syncThreadsAligned() { syncThreads(); } | ||||
void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); } | // TODO: Don't have wavefront lane locks. Possibly can't have them. | ||||
void unsetLock(omp_lock_t *) { __builtin_trap(); } | |||||
void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); } | int testLock(omp_lock_t *) { __builtin_trap(); } | ||||
void initLock(omp_lock_t *) { __builtin_trap(); } | |||||
void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); } | void destroyLock(omp_lock_t *) { __builtin_trap(); } | ||||
void setLock(omp_lock_t *) { __builtin_trap(); } | |||||
JonChesterfield: Error here - syncThreadsAligned is deleted but should not be. | |||||
#pragma omp end declare variant | #pragma omp end declare variant | ||||
///} | ///} | ||||
/// NVPTX Implementation | /// NVPTX Implementation | ||||
/// | /// | ||||
///{ | ///{ | ||||
#pragma omp begin declare variant match( \ | #pragma omp begin declare variant match( \ | ||||
▲ Show 20 Lines • Show All 177 Lines • Show Last 20 Lines |
This is not good; we need to revise the sema checking on these intrinsics and add some lowering in clang/llvm that builds the switch. It is written longhand here to get things running.