Diff 314402

openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt

	Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines
	endif()			endif()

	# create libraries			# create libraries
	set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900)			set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900)
	if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)			if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
	set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})			set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
	endif()			endif()

				# Select the language-mode flags handed to clang++ for the device sources:
				# HIP when LIBOMPTARGET_AMDGCN_BUILD_AS_HIP is true, otherwise C++ compiled
				# with OpenMP offloading to amdgcn-amd-amdhsa.
				# NOTE(review): ${mcpu} is expanded at the moment set() executes, not when
				# libomptarget_amdgcn_clang_flags is used later. Confirm that mcpu already
				# holds the intended gfx target at this point; otherwise the arch flags
				# below are built from an empty value.
				if (LIBOMPTARGET_AMDGCN_BUILD_AS_HIP)
				# HIP build: relocatable device code for one GPU arch, without the
				# default GPU library or GPU include wrappers.
				set(libomptarget_amdgcn_clang_flags
				-x hip
				-fcuda-rdc
				--cuda-gpu-arch=${mcpu}
				-nogpulib -nogpuinc
				)
				else()
				# OpenMP build: the sources are treated as C++ and offloaded to amdgcn.
				set(libomptarget_amdgcn_clang_flags
				-x c++
				-c
				-fopenmp
				-fopenmp-cuda-mode # skip data sharing prologue on device functions
				-fopenmp-targets=amdgcn-amd-amdhsa
				# -Xopenmp-target=<triple> forwards the next argument (-march) to the
				# offload toolchain for that triple.
				-Xopenmp-target=amdgcn-amd-amdhsa
				-march=${mcpu}
				-nogpulib
				-D__AMDGCN__ # Code uses this to distinguish vs nvptx
				)
				endif()


	macro(add_cuda_bc_library)			macro(add_cuda_bc_library)
	set(cu_cmd ${AOMP_BINDIR}/clang++			set(cu_cmd ${AOMP_BINDIR}/clang++
				JonChesterfield (author, unsubmitted, done): Trunk clang doesn't accept these flags just yet, but trunk clang also refuses to compile this code as -x hip, so this isn't much of a regression.
	-std=c++14			-std=c++14
	-fcuda-rdc			${libomptarget_amdgcn_clang_flags}
	-fvisibility=default			-fvisibility=default
	--cuda-device-only
	-Wno-unused-value			-Wno-unused-value
	-x hip			--cuda-device-only
	-nogpulib -nogpuinc
	-O${optimization_level}			-O${optimization_level}
	--cuda-gpu-arch=${mcpu}
	${CUDA_DEBUG}			${CUDA_DEBUG}
	-I${CMAKE_CURRENT_SOURCE_DIR}/src			-I${CMAKE_CURRENT_SOURCE_DIR}/src
	-I${devicertl_base_directory})			-I${devicertl_base_directory})

	set(bc1_files)			set(bc1_files)

	foreach(file ${ARGN})			foreach(file ${ARGN})
	get_filename_component(fname ${file} NAME_WE)			get_filename_component(fname ${file} NAME_WE)
	Show All 36 Lines

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h

This file was added.

				//===--- amdgcn_intrinsics.h - Intrinsics used by deviceRTL ---------------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#ifndef _AMDGCN_INTRINSICS_H_
				Lint: Pre-merge checks Inline Actions clang-tidy: warning: header guard does not follow preferred style [llvm-header-guard] not useful Lint: Pre-merge checks: clang-tidy: warning: header guard does not follow preferred style [llvm-header-guard] [[https…
				#define _AMDGCN_INTRINSICS_H_

				#ifndef EXTERN
				#error "Expected definition of EXTERN"
				Lint: Pre-merge checks Inline Actions clang-tidy: error: "Expected definition of EXTERN" [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: "Expected definition of EXTERN" [clang-diagnostic-error] [[https://github.
				#endif

				#include <stdint.h>

				#ifdef _OPENMP
				// When compiling as OpenMP, these AMDGCN builtins are not pulled into scope
				// automatically, yet the compiler still reports an error if a declaration's
				// type does not match the builtin. They are therefore declared here with
				// matching types.
				// This may be a quirk of OpenMP's assumption that the source is compiled for
				// both host and device, where these names don't resolve to anything on the
				// host.

				EXTERN uint32_t __builtin_amdgcn_atomic_inc32(volatile uint32_t *, uint32_t,
				uint32_t, const char *);
				EXTERN void __builtin_amdgcn_s_barrier(void);
				EXTERN void __builtin_amdgcn_fence(uint32_t, const char *);

				EXTERN void __builtin_amdgcn_s_sleep(int);

				EXTERN uint32_t __builtin_amdgcn_workitem_id_x(void);
				EXTERN uint32_t __builtin_amdgcn_workgroup_id_x(void);
				EXTERN uint16_t __builtin_amdgcn_workgroup_size_x(void);
				EXTERN uint32_t __builtin_amdgcn_grid_size_x(void);

				EXTERN uint64_t __builtin_amdgcn_s_memrealtime(void);
				EXTERN uint32_t __builtin_amdgcn_s_getreg(int32_t);
				EXTERN uint64_t __builtin_amdgcn_read_exec(void);

				// Returns a pointer in address space 4.
				EXTERN __attribute__((address_space(4))) void *
				__builtin_amdgcn_dispatch_ptr() noexcept;

				EXTERN uint32_t __builtin_amdgcn_mbcnt_lo(uint32_t, uint32_t);
				EXTERN uint32_t __builtin_amdgcn_mbcnt_hi(uint32_t, uint32_t);
				EXTERN int32_t __builtin_amdgcn_ds_bpermute(int32_t, int32_t);
				#endif // _OPENMP

				#endif
				jdoerfert (unsubmitted, not done): This seems unrelated; can it go in before or after?
				JonChesterfield (author, unsubmitted, done): I think the problem goes:
				- OpenMP compiles target regions for the host.
				- __builtin_amdgcn_s_sleep can't be called from host code (as it's not a thing).
				- Therefore the compiler doesn't add the symbol to the host compilation.
				This works for device compilation, as the names and types match the functions that actually exist. It 'works' for host compilation, in so far as it emits calls to undefined functions with these names, and the compiler throws away the result of the host compilation. CUDA has a similar quirk where calling a device intrinsic from a function that is compiled for the host fails. I'm not sure if nvptx/openmp will have the same behaviour. I'm not sure what the pretty solution to this is. Somewhat inclined to compile target_impl as C++, instead of as OpenMP, as that bypasses it. I'd quite like to compile the whole library as C++, instead of engaging all the OpenMP machinery, but clang doesn't handle constructors in address spaces well enough.

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip

	//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//			//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock			// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock
	// cannot be implemented - if one thread gets the lock, it can't continue on to			// cannot be implemented - if one thread gets the lock, it can't continue on to
	// the next instruction in order to do anything as the other threads are waiting			// the next instruction in order to do anything as the other threads are waiting
	// to take the lock.			// to take the lock.
	// These functions will be implemented to provide the documented semantics for			// These functions will be implemented to provide the documented semantics for
	// a SIMD => wavefront mapping once that is implemented.			// a SIMD => wavefront mapping once that is implemented.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "common/debug.h"			#include "common/debug.h"

	static DEVICE void warn() {			static DEVICE void warn() {
	PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");			PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
	}			}

	DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }			DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
	DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }			DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
	DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }			DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
	DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }			DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
	DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); }			DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); }

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip

	//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//			//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "target_impl.h"			#include "target_impl.h"

	// Partially derived from hcc_detail/device_functions.h			// Partially derived from hcc_detail/device_functions.h

	// HW_ID Register bit structure			// HW_ID Register bit structure
	// WAVE_ID 3:0 Wave buffer slot number. 0-9.			// WAVE_ID 3:0 Wave buffer slot number. 0-9.
	// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.			// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
	Show All 38 Lines

	DEVICE uint32_t __kmpc_impl_smid() {			DEVICE uint32_t __kmpc_impl_smid() {
	uint32_t cu_id = __builtin_amdgcn_s_getreg(			uint32_t cu_id = __builtin_amdgcn_s_getreg(
	ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));			ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
	uint32_t se_id = __builtin_amdgcn_s_getreg(			uint32_t se_id = __builtin_amdgcn_s_getreg(
	ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));			ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
	return (se_id << HW_ID_CU_ID_SIZE) + cu_id;			return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
	}			}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

	//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//			//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Declarations and definitions of target specific functions and constants			// Declarations and definitions of target specific functions and constants
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H			#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
	#define OMPTARGET_AMDGCN_TARGET_IMPL_H			#define OMPTARGET_AMDGCN_TARGET_IMPL_H

	#ifndef __AMDGCN__			#ifndef __AMDGCN__
	#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"			#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
				Lint: Pre-merge checks Inline Actions clang-tidy: error: "amdgcn target_impl.h expects to be compiled under AMDGCN" [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: "amdgcn target_impl.h expects to be compiled under __AMDGCN__" [clang…
	#endif			#endif

	#include "amdgcn_interface.h"			#include "amdgcn_interface.h"
				#include "amdgcn_intrinsics.h"
				#include "omp_pteam_mem_alloc.h"
				Lint: Pre-merge checks Inline Actions clang-tidy: error: 'omp_pteam_mem_alloc.h' file not found [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: 'omp_pteam_mem_alloc.h' file not found [clang-diagnostic-error] [[https…

	#include <assert.h>			#include <assert.h>
	#include <inttypes.h>			#include <inttypes.h>
	#include <stddef.h>			#include <stddef.h>
	#include <stdint.h>			#include <stdint.h>

				#ifdef _OPENMP
				#define DEVICE
				#else
	#define DEVICE __attribute__((device))			#define DEVICE __attribute__((device))
				#endif

	#define INLINE inline DEVICE			#define INLINE inline DEVICE
	#define NOINLINE __attribute__((noinline)) DEVICE			#define NOINLINE __attribute__((noinline)) DEVICE
	#define SHARED __attribute__((shared))
				#ifdef _OPENMP
				#define SHARED(NAME) OMP_PTEAM_MEM_ALLOC(NAME)
				#define EXTERN_SHARED(NAME) EXTERN_OMP_PTEAM_MEM_ALLOC(NAME)
				#else // HIP
				#define SHARED(NAME) __attribute__((shared)) NAME
				#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME
				#endif

	#define ALIGN(N) __attribute__((aligned(N)))			#define ALIGN(N) __attribute__((aligned(N)))

	#include "hip_atomics.h"			#include "hip_atomics.h"

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Kernel options			// Kernel options
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// The following def must match the absolute limit hardwired in the host RTL			// The following def must match the absolute limit hardwired in the host RTL
	// max number of threads per team			// max number of threads per team
	#define MAX_THREADS_PER_TEAM 1024			#define MAX_THREADS_PER_TEAM 1024

	#define WARPSIZE 64			#define WARPSIZE 64

	// Maximum number of preallocated arguments to an outlined parallel/simd			// Maximum number of preallocated arguments to an outlined parallel/simd
	// function. Anything more requires dynamic memory allocation.			// function. Anything more requires dynamic memory allocation.
	#define MAX_SHARED_ARGS 20			#define MAX_SHARED_ARGS 20

	// Maximum number of omp state objects per SM allocated statically in global			// Maximum number of omp state objects per SM allocated statically in global
	// memory.			// memory.
	#define OMP_STATE_COUNT 32			#define OMP_STATE_COUNT 32
	#define MAX_SM 64			#define MAX_SM 64
				jdoerfert (unsubmitted, done): Why the enum? Can we move the _OPENMP stuff into a generic header?
				JonChesterfield (author, unsubmitted, done): This one was fun. I'd have liked to write '7', but clang sema does some very aggressive checking on the allocator clause that only accepts an enum with the right name and all the right fields filled out. Yes though, we should have a header in common that exports the SHARED and EXTERN_SHARED macros, which will be helpful for nvptx. Provisionally named it omp_pteam_mem_alloc.

	#define OMP_ACTIVE_PARALLEL_LEVEL 128			#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities, need to match what is used in the compiler.			// Data sharing related quantities, need to match what is used in the compiler.
	enum DATA_SHARING_SIZES {			enum DATA_SHARING_SIZES {
	// The maximum number of workers in a kernel.			// The maximum number of workers in a kernel.
	DS_Max_Worker_Threads = 960,			DS_Max_Worker_Threads = 960,
	// The size reserved for data in a shared memory slot.			// The size reserved for data in a shared memory slot.
	▲ Show 20 Lines • Show All 97 Lines • Show Last 20 Lines

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

	//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//			//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Definitions of target specific functions			// Definitions of target specific functions
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "target_impl.h"			#include "target_impl.h"

	// Implementations initially derived from hcc			// Implementations initially derived from hcc

	// Initialized with a 64-bit mask with bits set in positions less than the			// Initialized with a 64-bit mask with bits set in positions less than the
	// thread's lane number in the warp			// thread's lane number in the warp
	DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {			DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
	Show All 40 Lines
	DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,			DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
	uint32_t laneDelta, int32_t width) {			uint32_t laneDelta, int32_t width) {
	int self = GetLaneId();			int self = GetLaneId();
	int index = self + laneDelta;			int index = self + laneDelta;
	index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;			index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
	return __builtin_amdgcn_ds_bpermute(index << 2, var);			return __builtin_amdgcn_ds_bpermute(index << 2, var);
	}			}

	static DEVICE SHARED uint32_t L1_Barrier;			DEVICE uint32_t SHARED(__kmpc_L1_Barrier);

	DEVICE void __kmpc_impl_target_init() {			DEVICE void __kmpc_impl_target_init() {
	// Don't have global ctors, and shared memory is not zero init			// Don't have global ctors, and shared memory is not zero init
	__atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);			__atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);
	}			}
				jdoerfert (unsubmitted, done): Nit: I'd remove the comment.
				JonChesterfield (author, unsubmitted, done): Happy to. All of the comment, or just the // static?

	DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {			DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
	__atomic_thread_fence(__ATOMIC_ACQUIRE);			__atomic_thread_fence(__ATOMIC_ACQUIRE);

	uint32_t num_waves = num_threads / WARPSIZE;			uint32_t num_waves = num_threads / WARPSIZE;

	// Partial barrier implementation for amdgcn.			// Partial barrier implementation for amdgcn.
	// Uses two 16 bit unsigned counters. One for the number of waves to have			// Uses two 16 bit unsigned counters. One for the number of waves to have
	// reached the barrier, and one to count how many times the barrier has been			// reached the barrier, and one to count how many times the barrier has been
	// passed. These are packed in a single atomically accessed 32 bit integer.			// passed. These are packed in a single atomically accessed 32 bit integer.
	// Low bits for the number of waves, assumed zero before this call.			// Low bits for the number of waves, assumed zero before this call.
	// High bits to count the number of times the barrier has been passed.			// High bits to count the number of times the barrier has been passed.

	assert(num_waves != 0);			assert(num_waves != 0);
	assert(num_waves * WARPSIZE == num_threads);			assert(num_waves * WARPSIZE == num_threads);
	assert(num_waves < 0xffffu);			assert(num_waves < 0xffffu);

	// Increment the low 16 bits once, using the lowest active thread.			// Increment the low 16 bits once, using the lowest active thread.
	uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;			uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
	bool isLowest = GetLaneId() == lowestActiveThread;			bool isLowest = GetLaneId() == lowestActiveThread;

	if (isLowest) {			if (isLowest) {
	uint32_t load =			uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1,
	__atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative			__ATOMIC_RELAXED); // commutative

	// Record the number of times the barrier has been passed			// Record the number of times the barrier has been passed
	uint32_t generation = load & 0xffff0000u;			uint32_t generation = load & 0xffff0000u;

	if ((load & 0x0000ffffu) == (num_waves - 1)) {			if ((load & 0x0000ffffu) == (num_waves - 1)) {
	// Reached num_waves in low bits so this is the last wave.			// Reached num_waves in low bits so this is the last wave.
	// Set low bits to zero and increment high bits			// Set low bits to zero and increment high bits
	load += 0x00010000u; // wrap is safe			load += 0x00010000u; // wrap is safe
	load &= 0xffff0000u; // because bits zeroed second			load &= 0xffff0000u; // because bits zeroed second

	// Reset the wave counter and release the waiting waves			// Reset the wave counter and release the waiting waves
	__atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);			__atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED);
	} else {			} else {
	// more waves still to go, spin until generation counter changes			// more waves still to go, spin until generation counter changes
	do {			do {
	__builtin_amdgcn_s_sleep(0);			__builtin_amdgcn_s_sleep(0);
	load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);			load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED);
	} while ((load & 0xffff0000u) == generation);			} while ((load & 0xffff0000u) == generation);
	}			}
	}			}
	__atomic_thread_fence(__ATOMIC_RELEASE);			__atomic_thread_fence(__ATOMIC_RELEASE);
	}			}

	namespace {			namespace {
	DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {			DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
	uint32_t q = n / d;			uint32_t q = n / d;
	return q + (n > q * d);			return q + (n > q * d);
	}			}
	DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,			DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
	uint16_t group_size) {			uint16_t group_size) {
	uint32_t r = grid_size - group_id * group_size;			uint32_t r = grid_size - group_id * group_size;
	return (r < group_size) ? r : group_size;			return (r < group_size) ? r : group_size;
	}			}
	} // namespace			} // namespace

	DEVICE int GetNumberOfBlocksInKernel() {			DEVICE int GetNumberOfBlocksInKernel() {
	return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());			return get_grid_dim(__builtin_amdgcn_grid_size_x(),
				__builtin_amdgcn_workgroup_size_x());
	}			}

	DEVICE int GetNumberOfThreadsInBlock() {			DEVICE int GetNumberOfThreadsInBlock() {
	return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),			return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
				__builtin_amdgcn_grid_size_x(),
	__builtin_amdgcn_workgroup_size_x());			__builtin_amdgcn_workgroup_size_x());
	}			}

	DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }			DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
	DEVICE unsigned GetLaneId() {			DEVICE unsigned GetLaneId() {
	return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));			return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
	}			}

	EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {			EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
	return GetNumberOfThreadsInBlock();			return GetNumberOfThreadsInBlock();
	}			}

	// Stub implementations			// Stub implementations
	DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }			DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
	DEVICE void __kmpc_impl_free(void *) {}			DEVICE void __kmpc_impl_free(void *) {}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/omp_pteam_mem_alloc.h

This file was added.

				//===-- omp_pteam_mem_alloc.h - OpenMP GPU shared memory macros --- C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// Macros for allocating variables in shared/LDS/pteam_mem_alloc address space
				//
				//===----------------------------------------------------------------------===//

				#ifndef OMPTARGET_OMP_PTEAM_MEM_ALLOC_H
				Lint: Pre-merge checks Inline Actions clang-tidy: warning: header guard does not follow preferred style [llvm-header-guard] not useful Lint: Pre-merge checks: clang-tidy: warning: header guard does not follow preferred style [llvm-header-guard] [[https…
				#define OMPTARGET_OMP_PTEAM_MEM_ALLOC_H

				#ifdef _OPENMP

				// KMP_ALLOCATOR_MAX_HANDLE below is defined in terms of UINTPTR_MAX, which
				// comes from <stdint.h>. Include it here so that this header is
				// self-contained and does not depend on what its includer pulled in first.
				#include <stdint.h>

				// Follows the pattern in interface.h
				// Clang sema checks this type carefully, needs to closely match that from omp.h
				typedef enum omp_allocator_handle_t {
				omp_null_allocator = 0,
				omp_default_mem_alloc = 1,
				omp_large_cap_mem_alloc = 2,
				omp_const_mem_alloc = 3,
				omp_high_bw_mem_alloc = 4,
				omp_low_lat_mem_alloc = 5,
				omp_cgroup_mem_alloc = 6,
				omp_pteam_mem_alloc = 7,
				omp_thread_mem_alloc = 8,
				KMP_ALLOCATOR_MAX_HANDLE = UINTPTR_MAX
				} omp_allocator_handle_t;

				// OMP_PRAGMA2 turns its argument into a string literal and passes it to
				// _Pragma through OMP_PRAGMA, so a pragma can be emitted from inside a macro.
				#define OMP_PRAGMA(STR) _Pragma(STR)
				#define OMP_PRAGMA2(STR) OMP_PRAGMA(#STR)

				// Completes a variable definition: the caller writes the type before the
				// macro, e.g. `uint32_t OMP_PTEAM_MEM_ALLOC(var);`. NAME is declared with
				// [[clang::loader_uninitialized]] and then placed in the
				// omp_pteam_mem_alloc allocator via `#pragma omp allocate`.
				#define OMP_PTEAM_MEM_ALLOC(NAME) \
				NAME [[clang::loader_uninitialized]]; \
				OMP_PRAGMA2(omp allocate(NAME) allocator(omp_pteam_mem_alloc))

				// Declaration counterpart of OMP_PTEAM_MEM_ALLOC: the same allocate pragma
				// is attached to NAME, but without the loader_uninitialized attribute.
				#define EXTERN_OMP_PTEAM_MEM_ALLOC(NAME) \
				NAME; \
				OMP_PRAGMA2(omp allocate(NAME) allocator(omp_pteam_mem_alloc))

				#endif // _OPENMP

				#endif

openmp/libomptarget/deviceRTLs/common/omptarget.h

//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//		//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file contains the declarations of all library macros, types,		// This file contains the declarations of all library macros, types,
// and functions.		// and functions.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_H		#ifndef OMPTARGET_H
#define OMPTARGET_H		#define OMPTARGET_H

#include "target_impl.h"		#include "target_impl.h"
		Lint: Pre-merge checks Inline Actions clang-tidy: error: 'target_impl.h' file not found [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: 'target_impl.h' file not found [clang-diagnostic-error] [[https://github.
#include "common/debug.h" // debug		#include "common/debug.h" // debug
#include "interface.h" // interfaces with omp, compiler, and user		#include "interface.h" // interfaces with omp, compiler, and user
#include "common/state-queue.h"		#include "common/state-queue.h"
#include "common/support.h"		#include "common/support.h"

#define OMPTARGET_NVPTX_VERSION 1.1		#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app		// used by the library for the interface with the app
Show All 40 Lines	private:
void *buffer[MAX_SHARED_ARGS];		void *buffer[MAX_SHARED_ARGS];
// pointer to arguments buffer.		// pointer to arguments buffer.
// starts off as a pointer to 'buffer' but can be dynamically allocated.		// starts off as a pointer to 'buffer' but can be dynamically allocated.
void **args;		void **args;
// starts off as MAX_SHARED_ARGS but can increase in size.		// starts off as MAX_SHARED_ARGS but can increase in size.
uint32_t nArgs;		uint32_t nArgs;
};		};

extern DEVICE SHARED omptarget_nvptx_SharedArgs		extern DEVICE
omptarget_nvptx_globalArgs;		omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_SharedArgs' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_SharedArgs' [readability…

// Worker slot type which is initialized with the default worker slot		// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.		// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {		struct __kmpc_data_sharing_slot {
__kmpc_data_sharing_slot *Next;		__kmpc_data_sharing_slot *Next;
__kmpc_data_sharing_slot *Prev;		__kmpc_data_sharing_slot *Prev;
void *PrevSlotStackPtr;		void *PrevSlotStackPtr;
void *DataEnd;		void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];		char Data[DS_Worker_Warp_Slot_Size];
};		};

// Data structure to keep in shared memory that traces the current slot, stack,		// Data structure to keep in shared memory that traces the current slot, stack,
// and frame pointer as well as the active threads that didn't exit the current		// and frame pointer as well as the active threads that didn't exit the current
// environment.		// environment.
struct DataSharingStateTy {		struct DataSharingStateTy {
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];		__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
void *StackPtr[DS_Max_Warp_Number];		void *StackPtr[DS_Max_Warp_Number];
void * volatile FramePtr[DS_Max_Warp_Number];		void * volatile FramePtr[DS_Max_Warp_Number];
__kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];		__kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};		};

extern DEVICE SHARED DataSharingStateTy DataSharingState;		extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state		// task ICV and (implicit & explicit) task state

class omptarget_nvptx_TaskDescr {		class omptarget_nvptx_TaskDescr {
public:		public:
// methods for flags		// methods for flags
INLINE omp_sched_t GetRuntimeSched() const;		INLINE omp_sched_t GetRuntimeSched() const;
▲ Show 20 Lines • Show All 183 Lines • ▼ Show 20 Lines
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// global data tables		// global data tables
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

extern DEVICE omptarget_nvptx_SimpleMemoryManager		extern DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;		omptarget_nvptx_simpleMemoryManager;
extern DEVICE SHARED uint32_t usedMemIdx;		extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming]…
extern DEVICE SHARED uint32_t usedSlotIdx;		extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming]…
extern DEVICE SHARED uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
extern DEVICE SHARED
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

extern DEVICE SHARED uint32_t execution_param;		#if _OPENMP
extern DEVICE SHARED void *ReductionScratchpadPtr;		extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
		#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
		JonChesterfieldAuthorUnsubmitted Not Done Reply Inline Actions Note to self: Does the allocate clause need to go on the declaration? According to spec and/or implementation? Ideally the header would only say 'there is a variable somewhere' and the implementation would specify that it's from a particular allocator, but that means IR gen will use a generic address space which may not work out JonChesterfield: Note to self: Does the allocate clause need to go on the declaration? According to spec and/or…
		#else
		extern DEVICE
		uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint8_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint8_t' [readability-identifier-naming]…
		#endif

		extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming]…
		extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming]…
		extern DEVICE uint16_t EXTERN_SHARED(nThreads);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint16_t' [readability-identifier-naming]…
		extern DEVICE omptarget_nvptx_ThreadPrivateContext *
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_ThreadPrivateContext' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_ThreadPrivateContext'…
		EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);

		extern DEVICE uint32_t EXTERN_SHARED(execution_param);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'uint32_t' [readability-identifier-naming]…
		extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for function 'EXTERN_SHARED' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for function 'EXTERN_SHARED' [readability-identifier…

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.		// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.		// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;		typedef void *omptarget_nvptx_WorkFn;
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
omptarget_nvptx_workFn;		extern volatile DEVICE
		omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
		Lint: Pre-merge checks Inline Actions clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_WorkFn' [readability-identifier-naming] not useful Lint: Pre-merge checks: clang-tidy: warning: invalid case style for variable 'omptarget_nvptx_WorkFn' [readability…

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// get private data structures		// get private data structures
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();		INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();		INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *		INLINE omptarget_nvptx_TaskDescr *
Show All 10 Lines

openmp/libomptarget/deviceRTLs/common/src/cancel.cu

	//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//			//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Interface to be used in the implementation of OpenMP cancel.			// Interface to be used in the implementation of OpenMP cancel.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "interface.h"			#include "interface.h"
	#include "common/debug.h"			#include "common/debug.h"

	EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,			EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
	int32_t cancelVal) {			int32_t cancelVal) {
	PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);			PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
	// disabled			// disabled
	return 0;			return 0;
	}			}

	EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,			EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
	int32_t cancelVal) {			int32_t cancelVal) {
	PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);			PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
	// disabled			// disabled
	return 0;			return 0;
	}			}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/critical.cu

	//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//			//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This file contains the implementation of critical with KMPC interface			// This file contains the implementation of critical with KMPC interface
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "interface.h"			#include "interface.h"
	#include "common/debug.h"			#include "common/debug.h"

	EXTERN			EXTERN
	void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,			void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
	kmp_CriticalName *lck) {			kmp_CriticalName *lck) {
	PRINT0(LD_IO, "call to kmpc_critical()\n");			PRINT0(LD_IO, "call to kmpc_critical()\n");
	omp_set_lock((omp_lock_t *)lck);			omp_set_lock((omp_lock_t *)lck);
	}			}

	EXTERN			EXTERN
	void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,			void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
	kmp_CriticalName *lck) {			kmp_CriticalName *lck) {
	PRINT0(LD_IO, "call to kmpc_end_critical()\n");			PRINT0(LD_IO, "call to kmpc_end_critical()\n");
	omp_unset_lock((omp_lock_t *)lck);			omp_unset_lock((omp_lock_t *)lck);
	}			}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//		//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file contains the implementation of data sharing environments		// This file contains the implementation of data sharing environments
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "target_impl.h"		#include "target_impl.h"

// Return true if this is the master thread.		// Return true if this is the master thread.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {		INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();		return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}		}

▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines	if (isSPMDExecutionMode) {
return;		return;
}		}
__kmpc_impl_threadfence();		__kmpc_impl_threadfence();
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),		ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"Must be called only in the target master thread.");		"Must be called only in the target master thread.");
omptarget_nvptx_simpleMemoryManager.Release();		omptarget_nvptx_simpleMemoryManager.Release();
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/libcall.cu

//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//		//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements the OpenMP runtime functions that can be		// This file implements the OpenMP runtime functions that can be
// invoked by the user in an OpenMP region		// invoked by the user in an OpenMP region
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "common/target_atomic.h"		#include "common/target_atomic.h"
#include "target_impl.h"		#include "target_impl.h"

EXTERN double omp_get_wtick(void) {		EXTERN double omp_get_wtick(void) {
double rc = __kmpc_impl_get_wtick();		double rc = __kmpc_impl_get_wtick();
PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);		PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
▲ Show 20 Lines • Show All 290 Lines • ▼ Show 20 Lines
}		}

EXTERN int omp_get_team_num() {		EXTERN int omp_get_team_num() {
int rc = GetOmpTeamId();		int rc = GetOmpTeamId();
PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);		PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
return rc;		return rc;
}		}

		// For some reason this function, and only this function, triggers
		// error: definition of builtin function 'omp_is_initial_device'
		// Working around here until the compiler quirk is understood
		#ifdef __AMDGCN__
		DEVICE int omp_is_initial_device_OVERLOAD(void) asm("omp_is_initial_device");
		DEVICE int omp_is_initial_device_OVERLOAD(void) {
		PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
		return 0; // 0 by def on device
		}
		#else
EXTERN int omp_is_initial_device(void) {		EXTERN int omp_is_initial_device(void) {
PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");		PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
return 0; // 0 by def on device		return 0; // 0 by def on device
}		}
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Note to self - don't use this bodge for nvptx JonChesterfield: Note to self - don't use this bodge for nvptx
		jdoerfertUnsubmitted Not Done Reply Inline Actions D38968 is the reason. We should revert that patch as we have context selectors now. jdoerfert: D38968 is the reason. We should revert that patch as we have context selectors now.
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Ah, nice. It's a diy constant folding thing. Agreed that's no longer required. JonChesterfield: Ah, nice. It's a diy constant folding thing. Agreed that's no longer required.
		#endif
// Unspecified on the device.		// Unspecified on the device.
EXTERN int omp_get_initial_device(void) {		EXTERN int omp_get_initial_device(void) {
PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");		PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
return 0;		return 0;
}		}

// Unused for now.		// Unused for now.
EXTERN int omp_get_max_task_priority(void) {		EXTERN int omp_get_max_task_priority(void) {
Show All 25 Lines	EXTERN void omp_unset_lock(omp_lock_t *lock) {
PRINT0(LD_IO, "call omp_unset_lock()\n");		PRINT0(LD_IO, "call omp_unset_lock()\n");
}		}

EXTERN int omp_test_lock(omp_lock_t *lock) {		EXTERN int omp_test_lock(omp_lock_t *lock) {
int rc = __kmpc_impl_test_lock(lock);		int rc = __kmpc_impl_test_lock(lock);
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);		PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;		return rc;
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/loop.cu

//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//		//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file contains the implementation of the KMPC interface		// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same		// for the loop construct plus other worksharing constructs that use the same
// interface as loops.		// interface as loops.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "target_impl.h"		#include "target_impl.h"
#include "common/target_atomic.h"		#include "common/target_atomic.h"

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// template class that encapsulate all the helper functions		// template class that encapsulate all the helper functions
▲ Show 20 Lines • Show All 727 Lines • ▼ Show 20 Lines	void __kmpc_for_static_init_8u_simple_generic(
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(		omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,		global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false);		/*IsSPMDExecutionMode=*/false);
}		}

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {		EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");		PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/omp_data.cu

	//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//			//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This file contains the data objects used on the GPU device.			// This file contains the data objects used on the GPU device.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "common/omptarget.h"			#include "common/omptarget.h"
	#include "common/device_environment.h"			#include "common/device_environment.h"

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// global device environment			// global device environment
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	DEVICE omptarget_device_environmentTy omptarget_device_environment;			DEVICE omptarget_device_environmentTy omptarget_device_environment;

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// global data holding OpenMP state information			// global data holding OpenMP state information
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	DEVICE			DEVICE
	omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>			omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
	omptarget_nvptx_device_State[MAX_SM];			omptarget_nvptx_device_State[MAX_SM];

	DEVICE omptarget_nvptx_SimpleMemoryManager			DEVICE omptarget_nvptx_SimpleMemoryManager
	omptarget_nvptx_simpleMemoryManager;			omptarget_nvptx_simpleMemoryManager;
	DEVICE SHARED uint32_t usedMemIdx;			DEVICE uint32_t SHARED(usedMemIdx);
	DEVICE SHARED uint32_t usedSlotIdx;			DEVICE uint32_t SHARED(usedSlotIdx);

	DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];			#ifdef _OPENMP
	DEVICE SHARED uint16_t threadLimit;			DEVICE [[clang::loader_uninitialized]] uint8_t
	DEVICE SHARED uint16_t threadsInTeam;			parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
	DEVICE SHARED uint16_t nThreads;			#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
				#else
				DEVICE uint8_t SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
				#endif

				DEVICE uint16_t SHARED(threadLimit);
				DEVICE uint16_t SHARED(threadsInTeam);
				DEVICE uint16_t SHARED(nThreads);
				JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions This is the only array variable, cleaner to write the pragma allocate here than to introduce another macro JonChesterfield: This is the only array variable, cleaner to write the pragma allocate here than to introduce…
	// Pointer to this team's OpenMP state object			// Pointer to this team's OpenMP state object
	DEVICE SHARED			DEVICE omptarget_nvptx_ThreadPrivateContext *
	omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;			SHARED(omptarget_nvptx_threadPrivateContext);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// The team master sets the outlined parallel function in this variable to			// The team master sets the outlined parallel function in this variable to
	// communicate with the workers. Since it is in shared memory, there is one			// communicate with the workers. Since it is in shared memory, there is one
	// copy of these variables for each kernel, instance, and team.			// copy of these variables for each kernel, instance, and team.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;			volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// OpenMP kernel execution parameters			// OpenMP kernel execution parameters
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	DEVICE SHARED uint32_t execution_param;			DEVICE uint32_t SHARED(execution_param);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Data sharing state			// Data sharing state
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	DEVICE SHARED DataSharingStateTy DataSharingState;			DEVICE DataSharingStateTy SHARED(DataSharingState);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Scratchpad for teams reduction.			// Scratchpad for teams reduction.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	DEVICE SHARED void *ReductionScratchpadPtr;			DEVICE void *SHARED(ReductionScratchpadPtr);

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Data sharing related variables.			// Data sharing related variables.
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;			DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/omptarget.cu

//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//		//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file contains the initialization code for the GPU		// This file contains the initialization code for the GPU
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "target_impl.h"		#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// global data tables		// global data tables
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

▲ Show 20 Lines • Show All 132 Lines • ▼ Show 20 Lines	EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
}		}
}		}

// Return true if the current target region is executed in SPMD mode.		// Return true if the current target region is executed in SPMD mode.
EXTERN int8_t __kmpc_is_spmd_exec_mode() {		EXTERN int8_t __kmpc_is_spmd_exec_mode() {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");		PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");
return isSPMDMode();		return isSPMDMode();
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/parallel.cu

Show All 25 Lines
// in the parallel loop is that for each barrier in the parallel		// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the		// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId that		// syncthread A. Thus they must preserve their current threadId that
// is larger than thread in team.		// is larger than thread in team.
//		//
// To make a long story short...		// To make a long story short...
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "target_impl.h"		#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)		// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////		////////////////////////////////////////////////////////////////////////////////

▲ Show 20 Lines • Show All 253 Lines • ▼ Show 20 Lines	EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
ASSERT0(LT_FUSSY, 0,		ASSERT0(LT_FUSSY, 0,
"should never have anything with new teams on device");		"should never have anything with new teams on device");
}		}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,		EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
int proc_bind) {		int proc_bind) {
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);		PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/reduction.cu

//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//		//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file contains the implementation of reduction with KMPC interface.		// This file contains the implementation of reduction with KMPC interface.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"
#include "common/target_atomic.h"		#include "common/target_atomic.h"
#include "target_impl.h"		#include "target_impl.h"

EXTERN		EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}		void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

▲ Show 20 Lines • Show All 181 Lines • ▼ Show 20 Lines	EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// In non-generic mode all workers participate in the teams reduction.		// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams		// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.		// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =		uint32_t NumThreads =
checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)		checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;		: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();		uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();		uint32_t NumTeams = GetNumberOfBlocksInKernel();
static SHARED unsigned Bound;		static unsigned SHARED(Bound);
static SHARED unsigned ChunkTeamCount;		static unsigned SHARED(ChunkTeamCount);

// Block progress for teams greater than the current upper		// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal		// limit. We always only allow a number of teams less or equal
// to the number of slots in the buffer.		// to the number of slots in the buffer.
bool IsMaster = isMaster(loc, ThreadId);		bool IsMaster = isMaster(loc, ThreadId);
while (IsMaster) {		while (IsMaster) {
// Atomic read		// Atomic read
Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);		Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	if (IsMaster && ChunkTeamCount == num_of_records - 1) {
// Allow SIZE number of teams to proceed writing their		// Allow SIZE number of teams to proceed writing their
// intermediate results to the global buffer.		// intermediate results to the global buffer.
__kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));		__kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
}		}

return 0;		return 0;
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/support.cu

	//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//			//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Wrapper implementation to some functions natively supported by the GPU.			// Wrapper implementation to some functions natively supported by the GPU.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "common/support.h"			#include "common/support.h"
	#include "common/debug.h"			#include "common/debug.h"
	#include "common/omptarget.h"			#include "common/omptarget.h"

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Execution Parameters			// Execution Parameters
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	▲ Show 20 Lines • Show All 239 Lines • ▼ Show 20 Lines
	DEVICE unsigned int *GetTeamsReductionTimestamp() {			DEVICE unsigned int *GetTeamsReductionTimestamp() {
	return static_cast<unsigned int *>(ReductionScratchpadPtr);			return static_cast<unsigned int *>(ReductionScratchpadPtr);
	}			}

	DEVICE char *GetTeamsReductionScratchpad() {			DEVICE char *GetTeamsReductionScratchpad() {
	return static_cast<char *>(ReductionScratchpadPtr) + 256;			return static_cast<char *>(ReductionScratchpadPtr) + 256;
	}			}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/sync.cu

	//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//			//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Include all synchronization.			// Include all synchronization.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
				#pragma omp declare target

	#include "common/omptarget.h"			#include "common/omptarget.h"
	#include "target_impl.h"			#include "target_impl.h"

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// KMP Ordered calls			// KMP Ordered calls
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Syncwarp			// Syncwarp
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) {			EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) {
	PRINT0(LD_IO, "call __kmpc_syncwarp\n");			PRINT0(LD_IO, "call __kmpc_syncwarp\n");
	__kmpc_impl_syncwarp(Mask);			__kmpc_impl_syncwarp(Mask);
	}			}

				#pragma omp end declare target

openmp/libomptarget/deviceRTLs/common/src/task.cu

Show All 20 Lines
// - private (of size given by task_alloc call). Accessed by		// - private (of size given by task_alloc call). Accessed by
// task+sizeof(klegacy_TaskDescr)		// task+sizeof(klegacy_TaskDescr)
// * private data *		// * private data *
// - shared: X. Accessed by shared ptr in klegacy_TaskDescr		// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
// * pointer table to shared variables *		// * pointer table to shared variables *
// - end		// - end
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
		#pragma omp declare target

#include "common/omptarget.h"		#include "common/omptarget.h"

EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(		EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
kmp_Ident *loc, // unused		kmp_Ident *loc, // unused
uint32_t global_tid, // unused		uint32_t global_tid, // unused
int32_t flag, // unused (because in our impl, all are immediately exec		int32_t flag, // unused (because in our impl, all are immediately exec
size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,		size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
▲ Show 20 Lines • Show All 172 Lines • ▼ Show 20 Lines	if (lb > ub)
return;		return;

// the compiler has already stored lb and ub in the kmp_TaskDescr structure		// the compiler has already stored lb and ub in the kmp_TaskDescr structure
// as we are using a single task to execute the entire loop, we can leave		// as we are using a single task to execute the entire loop, we can leave
// the initial task_t untouched		// the initial task_t untouched

__kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);		__kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
}		}

		#pragma omp end declare target

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

	//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//			//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Definitions of target specific functions			// Definitions of target specific functions
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	#ifndef _TARGET_IMPL_H_			#ifndef _TARGET_IMPL_H_
	#define _TARGET_IMPL_H_			#define _TARGET_IMPL_H_

	#include <assert.h>			#include <assert.h>
	#include <cuda.h>			#include <cuda.h>
				Lint: Pre-merge checks Inline Actions clang-tidy: error: 'cuda.h' file not found [clang-diagnostic-error] not useful Lint: Pre-merge checks: clang-tidy: error: 'cuda.h' file not found [clang-diagnostic-error] [[https://github.
	#include <inttypes.h>			#include <inttypes.h>
	#include <stdio.h>			#include <stdio.h>
	#include <stdlib.h>			#include <stdlib.h>

	#include "nvptx_interface.h"			#include "nvptx_interface.h"

	#define DEVICE __device__			#define DEVICE __device__
	#define INLINE __forceinline__ DEVICE			#define INLINE __forceinline__ DEVICE
	#define NOINLINE __noinline__ DEVICE			#define NOINLINE __noinline__ DEVICE
	#define SHARED __shared__
				#define SHARED(NAME) __shared__ NAME
				#define EXTERN_SHARED(NAME) __shared__ NAME

	#define ALIGN(N) __align__(N)			#define ALIGN(N) __align__(N)

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// Kernel options			// Kernel options
	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////			////////////////////////////////////////////////////////////////////////////////
	// The following def must match the absolute limit hardwired in the host RTL			// The following def must match the absolute limit hardwired in the host RTL
	▲ Show 20 Lines • Show All 191 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget][devicertl] Port amdgcn devicertl to openmp
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 314402

openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

openmp/libomptarget/deviceRTLs/common/omp_pteam_mem_alloc.h

openmp/libomptarget/deviceRTLs/common/omptarget.h

openmp/libomptarget/deviceRTLs/common/src/cancel.cu

openmp/libomptarget/deviceRTLs/common/src/critical.cu

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

openmp/libomptarget/deviceRTLs/common/src/libcall.cu

openmp/libomptarget/deviceRTLs/common/src/loop.cu

openmp/libomptarget/deviceRTLs/common/src/omp_data.cu

openmp/libomptarget/deviceRTLs/common/src/omptarget.cu

openmp/libomptarget/deviceRTLs/common/src/parallel.cu

openmp/libomptarget/deviceRTLs/common/src/reduction.cu

openmp/libomptarget/deviceRTLs/common/src/support.cu

openmp/libomptarget/deviceRTLs/common/src/sync.cu

openmp/libomptarget/deviceRTLs/common/src/task.cu

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

This is an archive of the discontinued LLVM Phabricator instance.

[libomptarget][devicertl] Port amdgcn devicertl to openmp
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 314402

openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_intrinsics.h

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip

openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

openmp/libomptarget/deviceRTLs/common/omp_pteam_mem_alloc.h

openmp/libomptarget/deviceRTLs/common/omptarget.h

openmp/libomptarget/deviceRTLs/common/src/cancel.cu

openmp/libomptarget/deviceRTLs/common/src/critical.cu

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

openmp/libomptarget/deviceRTLs/common/src/libcall.cu

openmp/libomptarget/deviceRTLs/common/src/loop.cu

openmp/libomptarget/deviceRTLs/common/src/omp_data.cu

openmp/libomptarget/deviceRTLs/common/src/omptarget.cu

openmp/libomptarget/deviceRTLs/common/src/parallel.cu

openmp/libomptarget/deviceRTLs/common/src/reduction.cu

openmp/libomptarget/deviceRTLs/common/src/support.cu

openmp/libomptarget/deviceRTLs/common/src/sync.cu

openmp/libomptarget/deviceRTLs/common/src/task.cu

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

[libomptarget][devicertl] Port amdgcn devicertl to openmp
AbandonedPublic