This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget] Specialize amdgpu devicertl on wave size for gfx10
Needs ReviewPublic

Authored by JonChesterfield on Aug 19 2021, 12:00 PM.

Download Raw Diff

Details

Reviewers

jdoerfert
ronlieb
dpalermo
dhruvachak

Summary

Use 32-bit arithmetic instead of relying on LLVM to recognise
that the high half of various uint64_t values is zero for wave32 code.

Performance optimisation only. Relies on D108380 and D108391.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

JonChesterfield created this revision.Aug 19 2021, 12:00 PM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald TranscriptAug 19 2021, 12:00 PM

JonChesterfield requested review of this revision.Aug 19 2021, 12:00 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptAug 19 2021, 12:00 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: openmp-commits, sstefan1, wdng. · View Herald Transcript

JonChesterfield removed a reviewer: jdoerfert.Aug 19 2021, 12:01 PM

Herald added a reviewer: jdoerfert. · View Herald TranscriptAug 19 2021, 12:01 PM

Harbormaster completed remote builds in B120394: Diff 367579.Aug 19 2021, 12:01 PM

Low priority, posting it so I don't forget about it. Would remove the only reviewer but phab automatically re-adds you.

Only useful after D108708 has landed

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
144–149	This should be a 'runtime' switch on gridvalues, will update the patch

JonChesterfield added reviewers: ronlieb, dpalermo, dhruvachak.Aug 27 2021, 4:23 AM

JonChesterfield added inline comments.

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
52	As a performance optimisation, this is probably in the noise. However, it will eliminate all the warp32 vs wave64 differences in the deviceRTL, making gfx10 a useful data point when debugging code that works on nvptx but fails on amdgpu. That is, if gfx10 works, it suggests the bug is in wave size. If it fails, it suggests the bug is not in wave size.

Fewer macros

Harbormaster completed remote builds in B123273: Diff 371669.Sep 9 2021, 1:12 PM

Revision Contents

Path

Size

openmp/

libomptarget/

deviceRTLs/

amdgcn/

src/

target_impl.h

10 lines

target_impl.hip

14 lines

Diff 371669

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

	//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//			//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Declarations and definitions of target specific functions and constants			// Declarations and definitions of target specific functions and constants
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H			#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
	#define OMPTARGET_AMDGCN_TARGET_IMPL_H			#define OMPTARGET_AMDGCN_TARGET_IMPL_H

	#ifndef __AMDGCN__			#ifndef __AMDGCN__
	#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"			#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
				Lint: Pre-merge checks Inline Actions clang-tidy: error: "amdgcn target_impl.h expects to be compiled under AMDGCN" [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: "amdgcn target_impl.h expects to be compiled under __AMDGCN__" [clang…
	#endif			#endif

	#include "amdgcn_interface.h"			#include "amdgcn_interface.h"

	#include <stddef.h>			#include <stddef.h>
	#include <stdint.h>			#include <stdint.h>

	// subset of inttypes.h			// subset of inttypes.h
	#define PRId64 "ld"			#define PRId64 "ld"
	#define PRIu64 "lu"			#define PRIu64 "lu"

	typedef uint64_t __kmpc_impl_lanemask_t;

	#define INLINE inline			#define INLINE inline
	#define NOINLINE __attribute__((noinline))			#define NOINLINE __attribute__((noinline))
	#define ALIGN(N) __attribute__((aligned(N)))			#define ALIGN(N) __attribute__((aligned(N)))
	#define PLUGIN_ACCESSIBLE \			#define PLUGIN_ACCESSIBLE \
	__attribute__((used)) /* Don't discard values the plugin reads */ \			__attribute__((used)) /* Don't discard values the plugin reads */ \
	__attribute__((visibility("default"))) /* Access via SHT_HASH */ \			__attribute__((visibility("default"))) /* Access via SHT_HASH */ \
	__attribute__((section(".data"))) /* Not .bss, can write before load */			__attribute__((section(".data"))) /* Not .bss, can write before load */

	#include "llvm/Frontend/OpenMP/OMPGridValues.h"			#include "llvm/Frontend/OpenMP/OMPGridValues.h"

	INLINE constexpr const llvm::omp::GV &getGridValue() {			INLINE constexpr const llvm::omp::GV &getGridValue() {
				Lint: Pre-merge checks Inline Actions clang-tidy: error: no return statement in constexpr function [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: no return statement in constexpr function [clang-diagnostic-error] [[https…
	return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();			return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
				Lint: Pre-merge checks Inline Actions clang-tidy: error: use of undeclared identifier '__AMDGCN_WAVEFRONT_SIZE' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: use of undeclared identifier '__AMDGCN_WAVEFRONT_SIZE' [clang-diagnostic…
	}			}

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Kernel options			// Kernel options
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// The following def must match the absolute limit hardwired in the host RTL			// The following def must match the absolute limit hardwired in the host RTL
	// max number of threads per team			// max number of threads per team
	enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };			enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
	enum { WARPSIZE = getGridValue().GV_Warp_Size };			enum { WARPSIZE = getGridValue().GV_Warp_Size };

				namespace detail {
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions As a performance optimisation, this is probably in the noise. However it will eliminate all the warp32 vs wave64 differences in the deviceRTL, making gfx10 a useful datapoint for debugging works on nvptx and fails on amdgpu. That is, if gfx10 works, it suggests the bug is in wave size. If it fails, it suggests the bug is not in wave size. JonChesterfield: As a performance optimisation, this is probably in the noise. However it will eliminate all…
				template <unsigned> struct UnsignedToType;
				template <> struct UnsignedToType<64u> { using type = uint64_t; };
				template <> struct UnsignedToType<32u> { using type = uint32_t; };
				} // namespace detail

				using __kmpc_impl_lanemask_t = detail::UnsignedToType<WARPSIZE>::type;
				Lint: Pre-merge checks Inline Actions clang-tidy: error: implicit instantiation of undefined template 'detail::UnsignedToType<0>' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: implicit instantiation of undefined template 'detail::UnsignedToType<0>'…

	// Maximum number of omp state objects per SM allocated statically in global			// Maximum number of omp state objects per SM allocated statically in global
	// memory.			// memory.
	#define OMP_STATE_COUNT 32			#define OMP_STATE_COUNT 32
	#define MAX_SM 64			#define MAX_SM 64

	#define OMP_ACTIVE_PARALLEL_LEVEL 128			#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities, need to match what is used in the compiler.			// Data sharing related quantities, need to match what is used in the compiler.
	enum DATA_SHARING_SIZES {			enum DATA_SHARING_SIZES {
	// The size reserved for data in a shared memory slot.			// The size reserved for data in a shared memory slot.
	DS_Slot_Size = getGridValue().GV_Slot_Size,			DS_Slot_Size = getGridValue().GV_Slot_Size,
	// The slot size that should be reserved for a working warp.			// The slot size that should be reserved for a working warp.
	DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),			DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
	// The maximum number of warps in use			// The maximum number of warps in use
	DS_Max_Warp_Number = getGridValue().maxWarpNumber(),			DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
	};			};

	enum : __kmpc_impl_lanemask_t {			enum : __kmpc_impl_lanemask_t {
				Lint: Pre-merge checks Inline Actions clang-tidy: error: unknown type name '__kmpc_impl_lanemask_t' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: unknown type name '__kmpc_impl_lanemask_t' [clang-diagnostic-error] [[https…
	__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0			__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
				Lint: Pre-merge checks Inline Actions clang-tidy: error: use of undeclared identifier '__kmpc_impl_lanemask_t' [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: use of undeclared identifier '__kmpc_impl_lanemask_t' [clang-diagnostic…
	};			};

	// The return code of printf is not checked in the call sites in this library.			// The return code of printf is not checked in the call sites in this library.
	// A call to a function named printf currently hits some special case handling			// A call to a function named printf currently hits some special case handling
	// for opencl, which translates to calls that do not presently exist for openmp			// for opencl, which translates to calls that do not presently exist for openmp
	// Therefore, for now, stub out printf while building this library.			// Therefore, for now, stub out printf while building this library.
	#define printf(...)			#define printf(...)

	#endif			#endif

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	EXTERN double __kmpc_impl_get_wtime() {
// This will probably need to be found by measurement on a number of		// This will probably need to be found by measurement on a number of
// architectures. Until then, return 0, which is very inaccurate as a		// architectures. Until then, return 0, which is very inaccurate as a
// timer but resolves the undefined symbol at link time.		// timer but resolves the undefined symbol at link time.
return 0;		return 0;
}		}

// Warp vote function		// Warp vote function
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {		EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
		static_assert(WARPSIZE == 64 || WARPSIZE == 32, "");
		if (WARPSIZE == 64) {
return __builtin_amdgcn_read_exec();		return __builtin_amdgcn_read_exec();
		} else {
		return __builtin_amdgcn_read_exec_lo();
		}
}		}

static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {		static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
__atomic_thread_fence(__ATOMIC_ACQUIRE);		__atomic_thread_fence(__ATOMIC_ACQUIRE);

uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;		uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;

// Partial barrier implementation for amdgcn.		// Partial barrier implementation for amdgcn.
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	EXTERN int __kmpc_get_hardware_num_threads_in_block() {
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),		return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
__builtin_amdgcn_grid_size_x(),		__builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());		__builtin_amdgcn_workgroup_size_x());
}		}

EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }		EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
EXTERN unsigned GetWarpSize() { return WARPSIZE; }		EXTERN unsigned GetWarpSize() { return WARPSIZE; }
EXTERN unsigned GetLaneId() {		EXTERN unsigned GetLaneId() {
		static_assert(WARPSIZE == 64 || WARPSIZE == 32, "");
		if (WARPSIZE == 64) {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));		return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
		} else {
		return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
		}
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions This should be a 'runtime' switch on gridvalues, will update the patch JonChesterfield: This should be a 'runtime' switch on gridvalues, will update the patch
}		}

EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {		EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
return __kmpc_get_hardware_num_threads_in_block();		return __kmpc_get_hardware_num_threads_in_block();
}		}

// Atomics		// Atomics
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {		uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines