This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Specialize amdgpu devicertl on wave size for gfx10
Needs ReviewPublic

Authored by JonChesterfield on Aug 19 2021, 12:00 PM.

Download Raw Diff

Details

Reviewers

jdoerfert
ronlieb
dpalermo
dhruvachak

Summary

Use 32 bit arithmetic instead of relying on llvm to recognise
that the high half of various uint64_t values is zero for wave32 code.

Performance optimisation only. Relies on D108380 and D108391.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

JonChesterfield created this revision.Aug 19 2021, 12:00 PM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald TranscriptAug 19 2021, 12:00 PM

JonChesterfield requested review of this revision.Aug 19 2021, 12:00 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptAug 19 2021, 12:00 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: openmp-commits, sstefan1, wdng. · View Herald Transcript

JonChesterfield removed a reviewer: jdoerfert.Aug 19 2021, 12:01 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptAug 19 2021, 12:01 PM

Harbormaster completed remote builds in B120394: Diff 367579.Aug 19 2021, 12:01 PM

Low priority, posting it so I don't forget about it. Would remove the only reviewer but phab automatically re-adds you.

Only useful after D108708 has landed

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
145	This should be a 'runtime' switch on gridvalues, will update the patch

JonChesterfield added reviewers: ronlieb, dpalermo, dhruvachak.Aug 27 2021, 4:23 AM

JonChesterfield added inline comments.

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
48	As a performance optimisation, this is probably in the noise. However it will eliminate all the warp32 vs wave64 differences in the deviceRTL, making gfx10 a useful datapoint for debugging "works on nvptx but fails on amdgpu" problems. That is, if gfx10 works, it suggests the bug is in wave size. If it fails, it suggests the bug is not in wave size.

less macros

Harbormaster completed remote builds in B123273: Diff 371669.Sep 9 2021, 1:12 PM

Revision Contents

Path

Size

openmp/

libomptarget/

deviceRTLs/

amdgcn/

src/

target_impl.h

10 lines

target_impl.hip

14 lines

Diff 367579

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

	Show All 19 Lines

	#include <stddef.h>			#include <stddef.h>
	#include <stdint.h>			#include <stdint.h>

	// subset of inttypes.h			// subset of inttypes.h
	#define PRId64 "ld"			#define PRId64 "ld"
	#define PRIu64 "lu"			#define PRIu64 "lu"

	typedef uint64_t __kmpc_impl_lanemask_t;

	#define INLINE inline			#define INLINE inline
	#define NOINLINE __attribute__((noinline))			#define NOINLINE __attribute__((noinline))
	#define ALIGN(N) __attribute__((aligned(N)))			#define ALIGN(N) __attribute__((aligned(N)))

	#include "llvm/Frontend/OpenMP/OMPGridValues.h"			#include "llvm/Frontend/OpenMP/OMPGridValues.h"

	INLINE constexpr const llvm::omp::GV &getGridValue() {			INLINE constexpr const llvm::omp::GV &getGridValue() {
	return llvm::omp::AMDGPUGridValues;			return llvm::omp::AMDGPUGridValues;
	}			}

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Kernel options			// Kernel options
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// The following def must match the absolute limit hardwired in the host RTL			// The following def must match the absolute limit hardwired in the host RTL
	// max number of threads per team			// max number of threads per team
	enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };			enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
	enum { WARPSIZE = getGridValue().GV_Warp_Size };			enum { WARPSIZE = getGridValue().GV_Warp_Size };

				namespace detail {
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions As a performance optimisation, this is probably in the noise. However it will eliminate all the warp32 vs wave64 differences in the deviceRTL, making gfx10 a useful datapoint for debugging works on nvptx and fails on amdgpu. That is, if gfx10 works, it suggests the bug is in wave size. If it fails, it suggests the bug is not in wave size. JonChesterfield: As a performance optimisation, this is probably in the noise. However it will eliminate all…
				template <unsigned> struct UnsignedToType;
				template <> struct UnsignedToType<64u> { using type = uint64_t; };
				template <> struct UnsignedToType<32u> { using type = uint32_t; };
				} // namespace detail

				using __kmpc_impl_lanemask_t = detail::UnsignedToType<WARPSIZE>::type;

	// Maximum number of omp state objects per SM allocated statically in global			// Maximum number of omp state objects per SM allocated statically in global
	// memory.			// memory.
	#define OMP_STATE_COUNT 32			#define OMP_STATE_COUNT 32
	#define MAX_SM 64			#define MAX_SM 64

	#define OMP_ACTIVE_PARALLEL_LEVEL 128			#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities, need to match what is used in the compiler.			// Data sharing related quantities, need to match what is used in the compiler.
	Show All 20 Lines

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	EXTERN double __kmpc_impl_get_wtime() {
// This will probably need to be found by measurement on a number of		// This will probably need to be found by measurement on a number of
// architectures. Until then, return 0, which is very inaccurate as a		// architectures. Until then, return 0, which is very inaccurate as a
// timer but resolves the undefined symbol at link time.		// timer but resolves the undefined symbol at link time.
return 0;		return 0;
}		}

// Warp vote function		// Warp vote function
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {		EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
		#if __AMDGCN_WAVEFRONT_SIZE == 64
return __builtin_amdgcn_read_exec();		return __builtin_amdgcn_read_exec();
		#elif __AMDGCN_WAVEFRONT_SIZE == 32
		return __builtin_amdgcn_read_exec_lo();
		#else
		#error "Unexpected WAVEFRONT_SIZE"
		#endif
}		}

static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {		static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);		__atomic_thread_fence(__ATOMIC_ACQUIRE);

uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;		uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;

// Partial barrier implementation for amdgcn.		// Partial barrier implementation for amdgcn.
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),		return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
__builtin_amdgcn_grid_size_x(),		__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());		__builtin_amdgcn_workgroup_size_x());
}		}

EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }		EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
EXTERN unsigned GetWarpSize() { return WARPSIZE; }		EXTERN unsigned GetWarpSize() { return WARPSIZE; }
EXTERN unsigned GetLaneId() {		EXTERN unsigned GetLaneId() {
		#if __AMDGCN_WAVEFRONT_SIZE == 64
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions This should be a 'runtime' switch on gridvalues, will update the patch JonChesterfield: This should be a 'runtime' switch on gridvalues, will update the patch
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));		return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
		#elif __AMDGCN_WAVEFRONT_SIZE == 32
		return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
		#else
		#error "Unexpected WAVEFRONT_SIZE"
		#endif
}		}

EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {		EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
return __kmpc_get_hardware_num_threads_in_block();		return __kmpc_get_hardware_num_threads_in_block();
}		}

// Atomics		// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {		uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines