This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
openmp/libomptarget/DeviceRTL/
-
libomptarget/
-
DeviceRTL/
-
include/
-
Synchronization.h
-
src/
-
Synchronization.cpp

Differential D130030

[OpenMP][DeviceRTL] Remove `atomic::store`
AbandonedPublic

Authored by tianshilei1992 on Jul 18 2022, 11:28 AM.

Download Raw Diff

Details

Reviewers

jdoerfert
jhuber6

Summary

atomic::store is currently used by AMDGPU for the named barrier. Internally
it calls the __atomic_store_n compiler builtin. However, the NVPTX backend
doesn't support the AtomicStore instruction, causing a backend crash when
calling llc on the device runtime directly, where atomic::store is not
optimized out. This patch removes atomic::store from the public area and uses
__atomic_store_n directly where it is needed as a workaround. We can come back
and revisit it in the future if atomic::store is needed somewhere else.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

tianshilei1992 created this revision.Jul 18 2022, 11:28 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 18 2022, 11:28 AM

Herald added subscribers: guansong, yaxunl. · View Herald Transcript

tianshilei1992 requested review of this revision.Jul 18 2022, 11:28 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 18 2022, 11:28 AM

Herald added subscribers: openmp-commits, sstefan1. · View Herald Transcript

LG, though the backend should be able to handle this, IMHO

This revision is now accepted and ready to land.Jul 18 2022, 11:38 AM

In D130030#3660553, @jdoerfert wrote:

LG, though the backend should be able to handle this, IMHO

Yeah, __atomic_exchange_n is expanded based on the ordering, and one of the expansions is still an atomic store, which brings us back to the current situation. atomic::store is only used by AMDGPU to implement the named barrier. I'm wondering if we want to drop atomic::store and use __atomic_store_n wherever it is needed?

refine it

This revision is now accepted and ready to land.Jul 18 2022, 11:59 AM

tianshilei1992 edited the summary of this revision. (Show Details)Jul 18 2022, 12:00 PM

Herald added subscribers: kosarev, tpr. · View Herald TranscriptJul 18 2022, 12:00 PM

tianshilei1992 retitled this revision from [OpenMP][DeviceRTL] Use `__atomic_exchange_n` to implement atomicStore to [OpenMP][DeviceRTL] Remove `atomic::store`.Jul 18 2022, 12:00 PM

tianshilei1992 requested review of this revision.Jul 18 2022, 12:09 PM

Harbormaster completed remote builds in B176091: Diff 445590.Jul 18 2022, 1:19 PM

No, this is strictly worse. If anything, we can introduce a switch to ensure the ordering is known statically. We can use a macro and do it for all of the atomic ops.

In D130030#3663558, @jdoerfert wrote:

No, this is strictly worse. If anything, we can introduce a switch to ensure the ordering is known statically. We can use a macro and do it for all of the atomic ops.

It doesn't work because __atomic_exchange_n with __ATOMIC_RELEASE will be lowered to atomicLoad, which again causes the issue.

tianshilei1992 abandoned this revision.Sep 4 2022, 12:17 PM

Revision Contents

Path

Size

openmp/

libomptarget/

DeviceRTL/

include/

Synchronization.h

3 lines

src/

Synchronization.cpp

18 lines

Diff 445590

openmp/libomptarget/DeviceRTL/include/Synchronization.h

	Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines

	} // namespace fence			} // namespace fence

	namespace atomic {			namespace atomic {

	/// Atomically load \p Addr with \p Ordering semantics.			/// Atomically load \p Addr with \p Ordering semantics.
	uint32_t load(uint32_t *Addr, int Ordering);			uint32_t load(uint32_t *Addr, int Ordering);

	/// Atomically store \p V to \p Addr with \p Ordering semantics.
	void store(uint32_t *Addr, uint32_t V, int Ordering);

	/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.			/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
	uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);			uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);

	/// Atomically add \p V to \p *Addr with \p Ordering semantics.			/// Atomically add \p V to \p *Addr with \p Ordering semantics.
	uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);			uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);

	/// Atomically add \p V to \p *Addr with \p Ordering semantics.			/// Atomically add \p V to \p *Addr with \p Ordering semantics.
	uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);			uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);

	} // namespace atomic			} // namespace atomic

	} // namespace _OMP			} // namespace _OMP

	#endif			#endif

openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Show All 29 Lines
///{		///{
/// NOTE: This function needs to be implemented by every target.		/// NOTE: This function needs to be implemented by every target.
uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);		uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);

uint32_t atomicLoad(uint32_t *Address, int Ordering) {		uint32_t atomicLoad(uint32_t *Address, int Ordering) {
return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);		return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
}		}

void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
__atomic_store_n(Address, Val, Ordering);
}

uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {		uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
return __atomic_fetch_add(Address, Val, Ordering);		return __atomic_fetch_add(Address, Val, Ordering);
}		}
uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {		uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
return __atomic_fetch_max(Address, Val, Ordering);		return __atomic_fetch_max(Address, Val, Ordering);
}		}

uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {		uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	case __ATOMIC_SEQ_CST:
return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");		return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
}		}
}		}

uint32_t SHARED(namedBarrierTracker);		uint32_t SHARED(namedBarrierTracker);

void namedBarrierInit() {		void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init		// Don't have global ctors, and shared memory is not zero init
atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);		// FIXME: Using __atomic_store_n here is a workaround because NVPTX backend
		// doesn't support that, which could cause `llc` to crash because of
		// instruction selection failure.
		__atomic_store_n(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
}		}

void namedBarrier() {		void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();		uint32_t NumThreads = omp_get_num_threads();
// assert(NumThreads % 32 == 0);		// assert(NumThreads % 32 == 0);

uint32_t WarpSize = mapping::getWarpSize();		uint32_t WarpSize = mapping::getWarpSize();
uint32_t NumWaves = NumThreads / WarpSize;		uint32_t NumWaves = NumThreads / WarpSize;
Show All 21 Lines	if (mapping::isLeaderInWarp()) {

if ((load & 0x0000ffffu) == (NumWaves - 1)) {		if ((load & 0x0000ffffu) == (NumWaves - 1)) {
// Reached NumWaves in low bits so this is the last wave.		// Reached NumWaves in low bits so this is the last wave.
// Set low bits to zero and increment high bits		// Set low bits to zero and increment high bits
load += 0x00010000u; // wrap is safe		load += 0x00010000u; // wrap is safe
load &= 0xffff0000u; // because bits zeroed second		load &= 0xffff0000u; // because bits zeroed second

// Reset the wave counter and release the waiting waves		// Reset the wave counter and release the waiting waves
atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);		// FIXME: Using __atomic_store_n here is a workaround because NVPTX
		// backend doesn't support that, which could cause `llc` to crash because
		// of instruction selection failure.
		__atomic_store_n(&namedBarrierTracker, load, __ATOMIC_RELAXED);
} else {		} else {
// more waves still to go, spin until generation counter changes		// more waves still to go, spin until generation counter changes
do {		do {
__builtin_amdgcn_s_sleep(0);		__builtin_amdgcn_s_sleep(0);
load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);		load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);		} while ((load & 0xffff0000u) == generation);
}		}
}		}
▲ Show 20 Lines • Show All 158 Lines • ▼ Show 20 Lines
void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }		void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }

void fence::system(int Ordering) { impl::fenceSystem(Ordering); }		void fence::system(int Ordering) { impl::fenceSystem(Ordering); }

uint32_t atomic::load(uint32_t *Addr, int Ordering) {		uint32_t atomic::load(uint32_t *Addr, int Ordering) {
return impl::atomicLoad(Addr, Ordering);		return impl::atomicLoad(Addr, Ordering);
}		}

void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
impl::atomicStore(Addr, V, Ordering);
}

uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {		uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
return impl::atomicInc(Addr, V, Ordering);		return impl::atomicInc(Addr, V, Ordering);
}		}

uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {		uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
return impl::atomicAdd(Addr, V, Ordering);		return impl::atomicAdd(Addr, V, Ordering);
}		}

▲ Show 20 Lines • Show All 92 Lines • Show Last 20 Lines