Diff 282502

clang/lib/CodeGen/CGExpr.cpp

	Show First 20 Lines • Show All 119 Lines • ▼ Show 20 Lines
	Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,			Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
	const Twine &Name) {			const Twine &Name) {
	CharUnits Align =			CharUnits Align =
	CharUnits::fromQuantity(CGM.getDataLayout().getABITypeAlignment(Ty));			CharUnits::fromQuantity(CGM.getDataLayout().getABITypeAlignment(Ty));
	return CreateTempAlloca(Ty, Align, Name);			return CreateTempAlloca(Ty, Align, Name);
	}			}

	void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {			void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
	assert(isa<llvm::AllocaInst>(Var.getPointer()));			auto *Alloca = Var.getPointer();
	auto Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,			assert(isa<llvm::AllocaInst>(Alloca) ||
				saiislamAuthorUnsubmitted Done Reply Inline Actions @arsenm , it wasn't possible to post D78495 and this patch separately due to their interdependence. Test "clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp" below checks for this change. saiislam: @arsenm , it wasn't possible to post D78495 and this patch separately due to their…
				(isa<llvm::AddrSpaceCastInst>(Alloca) &&
				isa<llvm::AllocaInst>(
				cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));

				auto Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
	Var.getAlignment().getAsAlign());			Var.getAlignment().getAsAlign());
	llvm::BasicBlock *Block = AllocaInsertPt->getParent();			llvm::BasicBlock *Block = AllocaInsertPt->getParent();
	Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);			Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);
	}			}

	Address CodeGenFunction::CreateIRTemp(QualType Ty, const Twine &Name) {			Address CodeGenFunction::CreateIRTemp(QualType Ty, const Twine &Name) {
	CharUnits Align = getContext().getTypeAlignInChars(Ty);			CharUnits Align = getContext().getTypeAlignInChars(Ty);
	return CreateTempAlloca(ConvertType(Ty), Align, Name);			return CreateTempAlloca(ConvertType(Ty), Align, Name);
	▲ Show 20 Lines • Show All 5,202 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h

This file was added.

				//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This provides a class for OpenMP runtime code generation, specialized for
				// AMDGCN targets and derived from the generalized CGOpenMPRuntimeGPU class.
				//
				//===----------------------------------------------------------------------===//

				#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
				#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H

				#include "CGOpenMPRuntime.h"
				#include "CGOpenMPRuntimeGPU.h"
				#include "CodeGenFunction.h"
				#include "clang/AST/StmtOpenMP.h"

				namespace clang {
				namespace CodeGen {

				/// OpenMP runtime code generation for AMDGCN targets. Overrides the
				/// target-specific queries (warp size, thread id, number of threads) that
				/// CGOpenMPRuntimeGPU declares as pure virtual.
				class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
				ABataevUnsubmitted Not Done Reply Inline Actions `final` ABataev: `final`

				public:
				/// Create the AMDGCN runtime for \p CGM. Only device compilation
				/// (LangOpts.OpenMPIsDevice) is supported.
				explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);

				/// Get the GPU warp size.
				llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;

				ABataevUnsubmitted Not Done Reply Inline Actions These 2 lines can be removed, they do not add anything useful. ABataev: These 2 lines can be removed, they do not add anything useful.
				/// Get the id of the current thread on the GPU.
				llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;

				/// Get the maximum number of threads in a block of the GPU.
				llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
				};

				} // namespace CodeGen
				} // namespace clang

				#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp

This file was added.

				//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This provides a class for OpenMP runtime code generation, specialized for
				// AMDGCN targets and derived from the generalized CGOpenMPRuntimeGPU class.
				//
				//===----------------------------------------------------------------------===//

				#include "CGOpenMPRuntimeAMDGCN.h"
				#include "CGOpenMPRuntimeGPU.h"
				#include "CodeGenFunction.h"
				#include "clang/AST/Attr.h"
				#include "clang/AST/DeclOpenMP.h"
				#include "clang/AST/StmtOpenMP.h"
				#include "clang/AST/StmtVisitor.h"
				#include "clang/Basic/Cuda.h"
				#include "llvm/ADT/SmallPtrSet.h"
				#include "llvm/IR/IntrinsicsAMDGPU.h"

				using namespace clang;
				using namespace CodeGen;
				using namespace llvm::omp;

				CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
				    : CGOpenMPRuntimeGPU(CGM) {
				  // This runtime only supports device-side compilation; any other
				  // configuration is treated as unreachable.
				  const bool IsDeviceCompile = CGM.getLangOpts().OpenMPIsDevice;
				  if (IsDeviceCompile)
				    return;
				  llvm_unreachable("OpenMP AMDGCN can only handle device code.");
				}

				llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
				  // The warp size is queried from the target at compile time
				  // (GV_Warp_Size) and materialized as a 32-bit integer constant.
				  const unsigned TargetWarpSize =
				      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
				  return CGF.Builder.getInt32(TargetWarpSize);
				}

				llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
				  // Emit a call to the amdgcn_workitem_id_x intrinsic. The resulting value
				  // is named "nvptx_tid".
				  llvm::Function *WorkItemIdX =
				      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
				  return CGF.Builder.CreateCall(WorkItemIdX, llvm::None, "nvptx_tid");
				}

				// Emits a call to "__ockl_get_local_size" with the constant argument 0 and
				// truncates the result to CGF.Int32Ty.
				// NOTE(review): argument 0 presumably selects the x dimension - confirm
				// against the device library.
				llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
				CGBuilderTy &Bld = CGF.Builder;
				llvm::Module *M = &CGF.CGM.getModule();
				const char *LocSize = "__ockl_get_local_size";
				ABataevUnsubmitted Not Done Reply Inline Actions Move these comments to the header instead and remove them from the .cpp module. ABataev: Move these comments to the header instead and remove them from the .cpp module.
				llvm::Function *F = M->getFunction(LocSize);
				// Reuse the function if the module already has it; otherwise create it
				// with external linkage, returning CGF.Int64Ty and taking one CGF.Int32Ty
				// parameter (non-variadic).
				if (!F) {
				F = llvm::Function::Create(
				llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
				llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
				}
				// The call result is named "nvptx_num_threads" before truncation.
				return Bld.CreateTrunc(
				Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
				}

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

//===------ CGOpenMPRuntimeGPU.h - Interface to OpenMP GPU Runtimes ------===//		//===------ CGOpenMPRuntimeGPU.h - Interface to OpenMP GPU Runtimes ------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This provides a generalized class for OpenMP runtime code generation		// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU target NVPTX.		// specialized by GPU targets NVPTX and AMDGCN.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H		#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H
#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H		#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEGPU_H

#include "CGOpenMPRuntime.h"		#include "CGOpenMPRuntime.h"
#include "CodeGenFunction.h"		#include "CodeGenFunction.h"
▲ Show 20 Lines • Show All 174 Lines • ▼ Show 20 Lines	protected:
/// For NVPTX target contains data about SPMD/Non-SPMD execution mode +		/// For NVPTX target contains data about SPMD/Non-SPMD execution mode +
/// Full/Lightweight runtime mode. Used for better optimization.		/// Full/Lightweight runtime mode. Used for better optimization.
unsigned getDefaultLocationReserved2Flags() const override;		unsigned getDefaultLocationReserved2Flags() const override;

public:		public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);		explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
void clear() override;		void clear() override;

/// Declare generalized virtual functions which need to be defined		/// Declare generalized virtual functions which need to be defined
/// by all specializations of OpenMPGPURuntime Targets.		/// by all specializations of OpenMPGPURuntime Targets like AMDGCN
		/// and NVPTX.

		/// Get the GPU warp size.
virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;		virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
		ABataevUnsubmitted Not Done Reply Inline Actions Add these notes to the specialized functions too. It is required for better doxygen docs ABataev: Add these notes to the specialized functions too. It is required for better doxygen docs

		/// Get the id of the current thread on the GPU.
		virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;

		/// Get the maximum number of threads in a block of the GPU.
		virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;

/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32		/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.		/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
virtual void emitProcBindClause(CodeGenFunction &CGF,		virtual void emitProcBindClause(CodeGenFunction &CGF,
llvm::omp::ProcBindKind ProcBind,		llvm::omp::ProcBindKind ProcBind,
SourceLocation Loc) override;		SourceLocation Loc) override;

/// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32		/// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
/// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'		/// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
▲ Show 20 Lines • Show All 283 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//		//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This provides a generalized class for OpenMP runtime code generation		// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU target NVPTX.		// specialized by GPU targets NVPTX and AMDGCN.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"		#include "CGOpenMPRuntimeGPU.h"
#include "CGOpenMPRuntimeNVPTX.h"		#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"		#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"		#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"		#include "clang/AST/DeclOpenMP.h"
▲ Show 20 Lines • Show All 597 Lines • ▼ Show 20 Lines	public:
/// Returns the list of the escaped variables with the variably modified		/// Returns the list of the escaped variables with the variably modified
/// types.		/// types.
ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {		ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
return EscapedVariableLengthDecls.getArrayRef();		return EscapedVariableLengthDecls.getArrayRef();
}		}
};		};
} // anonymous namespace		} // anonymous namespace

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
"nvptx_tid");
}

/// Get the id of the warp in the block.		/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case		/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.		/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {		static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =		unsigned LaneIDBits =
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);		CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
		return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}		}

/// Get the id of the current lane in the Warp.		/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case		/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.		/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {		static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(		unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
llvm::omp::GV_Warp_Size_Log2_Mask);		llvm::omp::GV_Warp_Size_Log2_Mask);
return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
		return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
"nvptx_lane_id");		"nvptx_lane_id");
}		}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
"nvptx_num_threads");
}

/// Get the value of the thread_limit clause in the teams directive.		/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in		/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per		/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.		/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.		/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,		static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
bool IsInSPMDExecutionMode = false) {		bool IsInSPMDExecutionMode = false) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return IsInSPMDExecutionMode		return IsInSPMDExecutionMode
? getNVPTXNumThreads(CGF)		? RT.getGPUNumThreads(CGF)
: Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),		: Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
"thread_limit");		RT.getGPUWarpSize(CGF), "thread_limit");
}		}

/// Get the thread id of the OMP master thread.		/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the		/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.		/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.		/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.		/// E.g: If NumThreads is 33, master id is 32.
/// If NumThreads is 64, master id is 32.		/// If NumThreads is 64, master id is 32.
/// If NumThreads is 1024, master id is 992.		/// If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {		static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
		llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
// We assume that the warp size is a power of 2.		// We assume that the warp size is a power of 2.
llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));		llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),		return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
Bld.CreateNot(Mask), "master_tid");		Bld.CreateNot(Mask), "master_tid");
}		}

CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(		CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
▲ Show 20 Lines • Show All 535 Lines • ▼ Show 20 Lines	void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
WorkerFunctionState &WST) {		WorkerFunctionState &WST) {
CGBuilderTy &Bld = CGF.Builder;		CGBuilderTy &Bld = CGF.Builder;

llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");		llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");		llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");		llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
EST.ExitBB = CGF.createBasicBlock(".exit");		EST.ExitBB = CGF.createBasicBlock(".exit");

		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
llvm::Value *IsWorker =		llvm::Value *IsWorker =
Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));		Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);		Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

CGF.EmitBlock(WorkerBB);		CGF.EmitBlock(WorkerBB);
emitCall(CGF, WST.Loc, WST.WorkerFn);		emitCall(CGF, WST.Loc, WST.WorkerFn);
CGF.EmitBranch(EST.ExitBB);		CGF.EmitBranch(EST.ExitBB);

CGF.EmitBlock(MasterCheckBB);		CGF.EmitBlock(MasterCheckBB);
llvm::Value *IsMaster =		llvm::Value *IsMaster =
Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));		Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);		Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

CGF.EmitBlock(MasterBB);		CGF.EmitBlock(MasterBB);
IsInTargetMasterThreadRegion = true;		IsInTargetMasterThreadRegion = true;
// SEQUENTIAL (MASTER) REGION START		// SEQUENTIAL (MASTER) REGION START
// First action in sequential region:		// First action in sequential region:
// Initialize the state of the OpenMP runtime library on the GPU.		// Initialize the state of the OpenMP runtime library on the GPU.
// TODO: Optimize runtime initialization and pass in correct value.		// TODO: Optimize runtime initialization and pass in correct value.
▲ Show 20 Lines • Show All 1,518 Lines • ▼ Show 20 Lines	void CGOpenMPRuntimeGPU::emitCriticalRegion(
const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,		const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
const Expr *Hint) {		const Expr *Hint) {
llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");		llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");		llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");		llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");		llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");		llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

// Get the mask of active threads in the warp.		// Get the mask of active threads in the warp.
llvm::Value *Mask = CGF.EmitRuntimeCall(		llvm::Value *Mask = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));		createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
// Fetch team-local id of the thread.		// Fetch team-local id of the thread.
llvm::Value *ThreadID = getNVPTXThreadID(CGF);		llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

// Get the width of the team.		// Get the width of the team.
llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);		llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

// Initialize the counter variable for the loop.		// Initialize the counter variable for the loop.
QualType Int32Ty =		QualType Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);		CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");		Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);		LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,		CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
/*isInit=*/true);		/*isInit=*/true);
▲ Show 20 Lines • Show All 446 Lines • ▼ Show 20 Lines	if (!TransferMedium) {
TransferMedium = new llvm::GlobalVariable(		TransferMedium = new llvm::GlobalVariable(
M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,		M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
llvm::Constant::getNullValue(Ty), TransferMediumName,		llvm::Constant::getNullValue(Ty), TransferMediumName,
/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,		/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
SharedAddressSpace);		SharedAddressSpace);
CGM.addCompilerUsedGlobal(TransferMedium);		CGM.addCompilerUsedGlobal(TransferMedium);
}		}

		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the CUDA thread id of the current OpenMP thread on the GPU.		// Get the CUDA thread id of the current OpenMP thread on the GPU.
llvm::Value *ThreadID = getNVPTXThreadID(CGF);		llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// nvptx_lane_id = nvptx_id % warpsize		// nvptx_lane_id = nvptx_id % warpsize
llvm::Value *LaneID = getNVPTXLaneID(CGF);		llvm::Value *LaneID = getNVPTXLaneID(CGF);
// nvptx_warp_id = nvptx_id / warpsize		// nvptx_warp_id = nvptx_id / warpsize
llvm::Value *WarpID = getNVPTXWarpID(CGF);		llvm::Value *WarpID = getNVPTXWarpID(CGF);

Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);		Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address LocalReduceList(		Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(		Bld.CreatePointerBitCastOrAddrSpaceCast(
▲ Show 20 Lines • Show All 1,576 Lines • ▼ Show 20 Lines	void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
FunctionGlobalizedDecls.erase(CGF.CurFn);		FunctionGlobalizedDecls.erase(CGF.CurFn);
CGOpenMPRuntime::functionFinished(CGF);		CGOpenMPRuntime::functionFinished(CGF);
}		}

void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(		void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
CodeGenFunction &CGF, const OMPLoopDirective &S,		CodeGenFunction &CGF, const OMPLoopDirective &S,
OpenMPDistScheduleClauseKind &ScheduleKind,		OpenMPDistScheduleClauseKind &ScheduleKind,
llvm::Value *&Chunk) const {		llvm::Value *&Chunk) const {
		auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {		if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
ScheduleKind = OMPC_DIST_SCHEDULE_static;		ScheduleKind = OMPC_DIST_SCHEDULE_static;
Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),		Chunk = CGF.EmitScalarConversion(
		RT.getGPUNumThreads(CGF),
CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),		CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
S.getIterationVariable()->getType(), S.getBeginLoc());		S.getIterationVariable()->getType(), S.getBeginLoc());
return;		return;
}		}
CGOpenMPRuntime::getDefaultDistScheduleAndChunk(		CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
CGF, S, ScheduleKind, Chunk);		CGF, S, ScheduleKind, Chunk);
}		}

▲ Show 20 Lines • Show All 379 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

	Show All 16 Lines
	#include "CGOpenMPRuntime.h"			#include "CGOpenMPRuntime.h"
	#include "CGOpenMPRuntimeGPU.h"			#include "CGOpenMPRuntimeGPU.h"
	#include "CodeGenFunction.h"			#include "CodeGenFunction.h"
	#include "clang/AST/StmtOpenMP.h"			#include "clang/AST/StmtOpenMP.h"

	namespace clang {			namespace clang {
	namespace CodeGen {			namespace CodeGen {

	class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {			class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
				ABataevUnsubmitted Not Done Reply Inline Actions `final` ABataev: `final`

	public:			public:
	explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);			explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);

				/// Get the GPU warp size.
	llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;			llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
				ABataevUnsubmitted Not Done Reply Inline Actions These 2 lines can be removed, they do not add anything useful. ABataev: These 2 lines can be removed, they do not add anything useful.

				/// Get the id of the current thread on the GPU.
				llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;

				/// Get the maximum number of threads in a block of the GPU.
				llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
	};			};

	} // CodeGen namespace.			} // CodeGen namespace.
	} // clang namespace.			} // clang namespace.

	#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H			#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

	Show All 26 Lines
	using namespace llvm::omp;			using namespace llvm::omp;

	CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)			CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
	: CGOpenMPRuntimeGPU(CGM) {			: CGOpenMPRuntimeGPU(CGM) {
	if (!CGM.getLangOpts().OpenMPIsDevice)			if (!CGM.getLangOpts().OpenMPIsDevice)
	llvm_unreachable("OpenMP NVPTX can only handle device code.");			llvm_unreachable("OpenMP NVPTX can only handle device code.");
	}			}

	/// Get the GPU warp size.
	llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {			llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
	return CGF.EmitRuntimeCall(			return CGF.EmitRuntimeCall(
	llvm::Intrinsic::getDeclaration(			llvm::Intrinsic::getDeclaration(
	&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),			&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
	"nvptx_warp_size");			"nvptx_warp_size");
	}			}

				// Emits a call to the nvvm_read_ptx_sreg_tid_x intrinsic and returns the
				// call's result, named "nvptx_tid".
				llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
				ABataevUnsubmitted Not Done Reply Inline Actions Same, move these comments to the header. ABataev: Same, move these comments to the header.
				CGBuilderTy &Bld = CGF.Builder;
				llvm::Function *F;
				// Obtain the intrinsic's declaration in the module being generated.
				F = llvm::Intrinsic::getDeclaration(
				&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
				// The intrinsic takes no arguments (llvm::None).
				return Bld.CreateCall(F, llvm::None, "nvptx_tid");
				}

				llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
				  // Emit a call to the nvvm_read_ptx_sreg_ntid_x intrinsic, declared in the
				  // module being generated; the result is named "nvptx_num_threads".
				  llvm::Module &TheModule = CGF.CGM.getModule();
				  llvm::Function *NTidX = llvm::Intrinsic::getDeclaration(
				      &TheModule, llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
				  return CGF.Builder.CreateCall(NTidX, llvm::None, "nvptx_num_threads");
				}

clang/lib/CodeGen/CMakeLists.txt

Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	add_clang_library(clangCodeGen
CGLoopInfo.cpp		CGLoopInfo.cpp
CGNonTrivialStruct.cpp		CGNonTrivialStruct.cpp
CGObjC.cpp		CGObjC.cpp
CGObjCGNU.cpp		CGObjCGNU.cpp
CGObjCMac.cpp		CGObjCMac.cpp
CGObjCRuntime.cpp		CGObjCRuntime.cpp
CGOpenCLRuntime.cpp		CGOpenCLRuntime.cpp
CGOpenMPRuntime.cpp		CGOpenMPRuntime.cpp
		CGOpenMPRuntimeAMDGCN.cpp
CGOpenMPRuntimeGPU.cpp		CGOpenMPRuntimeGPU.cpp
CGOpenMPRuntimeNVPTX.cpp		CGOpenMPRuntimeNVPTX.cpp
CGRecordLayoutBuilder.cpp		CGRecordLayoutBuilder.cpp
CGStmt.cpp		CGStmt.cpp
CGStmtOpenMP.cpp		CGStmtOpenMP.cpp
CGVTT.cpp		CGVTT.cpp
CGVTables.cpp		CGVTables.cpp
CodeGenABITypes.cpp		CodeGenABITypes.cpp
Show All 31 Lines

clang/lib/CodeGen/CodeGenModule.cpp

Show All 13 Lines
#include "CGBlocks.h"		#include "CGBlocks.h"
#include "CGCUDARuntime.h"		#include "CGCUDARuntime.h"
#include "CGCXXABI.h"		#include "CGCXXABI.h"
#include "CGCall.h"		#include "CGCall.h"
#include "CGDebugInfo.h"		#include "CGDebugInfo.h"
#include "CGObjCRuntime.h"		#include "CGObjCRuntime.h"
#include "CGOpenCLRuntime.h"		#include "CGOpenCLRuntime.h"
#include "CGOpenMPRuntime.h"		#include "CGOpenMPRuntime.h"
		#include "CGOpenMPRuntimeAMDGCN.h"
#include "CGOpenMPRuntimeNVPTX.h"		#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"		#include "CodeGenFunction.h"
#include "CodeGenPGO.h"		#include "CodeGenPGO.h"
#include "ConstantEmitter.h"		#include "ConstantEmitter.h"
#include "CoverageMappingGen.h"		#include "CoverageMappingGen.h"
#include "TargetInfo.h"		#include "TargetInfo.h"
#include "clang/AST/ASTContext.h"		#include "clang/AST/ASTContext.h"
#include "clang/AST/CharUnits.h"		#include "clang/AST/CharUnits.h"
▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Lines	void CodeGenModule::createOpenMPRuntime() {
// If it does not exist use the default implementation.		// If it does not exist use the default implementation.
switch (getTriple().getArch()) {		switch (getTriple().getArch()) {
case llvm::Triple::nvptx:		case llvm::Triple::nvptx:
case llvm::Triple::nvptx64:		case llvm::Triple::nvptx64:
assert(getLangOpts().OpenMPIsDevice &&		assert(getLangOpts().OpenMPIsDevice &&
"OpenMP NVPTX is only prepared to deal with device code.");		"OpenMP NVPTX is only prepared to deal with device code.");
OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));		OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
break;		break;
		case llvm::Triple::amdgcn:
		assert(getLangOpts().OpenMPIsDevice &&
		"OpenMP AMDGCN is only prepared to deal with device code.");
		OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
		break;
default:		default:
if (LangOpts.OpenMPSimd)		if (LangOpts.OpenMPSimd)
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));		OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
else		else
OpenMPRuntime.reset(new CGOpenMPRuntime(*this));		OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
break;		break;
}		}
}		}
▲ Show 20 Lines • Show All 5,842 Lines • Show Last 20 Lines

clang/test/OpenMP/amdgcn_target_codegen.cpp

This file was added.

				// REQUIRES: amdgpu-registered-target

				// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
				// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
				// expected-no-diagnostics
				#ifndef HEADER
				#define HEADER

				#define N 1000

				int test_amdgcn_target_tid_threads() {
				// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads

				int arr[N];
				ABataevUnsubmitted Not Done Reply Inline Actions `entry:` does not always exist in the LLVM IR functions, better not to check for it. ABataev: `entry:` does not always exist in the LLVM IR functions, better not to check for it.

				// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
				// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
				// CHECK-NEXT: sub nuw i32 [[VAR]], 64
				// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
				#pragma omp target
				ABataevUnsubmitted Not Done Reply Inline Actions Same, the order of these checks should be strict ABataev: Same, the order of these checks should be strict
				ABataevUnsubmitted Not Done Reply Inline Actions Remove `DAG` ABataev: Remove `DAG`
				for (int i = 0; i < N; i++) {
				ABataevUnsubmitted Not Done Reply Inline Actions Do not rely on names for the locals, some release builds do not generate them. Use regexps instead. ABataev: Do not rely on names for the locals, some release builds do not generate them. Use regexps…
				arr[i] = 1;
				}

				return arr[0];
				}

				int test_amdgcn_target_tid_threads_simd() {
				// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd

				int arr[N];

				// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
				// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
				// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
				#pragma omp target simd
				ABataevUnsubmitted Not Done Reply Inline Actions It should not be `CHECK-DAG`, the order of these 2 instructions is defined and the second one should definitely follow the first one ABataev: It should not be `CHECK-DAG`, the order of these 2 instructions is defined and the second one…
				for (int i = 0; i < N; i++) {
				arr[i] = 1;
				}
				return arr[0];
				}

				#endif

clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp

This file was added.

				// REQUIRES: amdgpu-registered-target

				// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
				// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
				// expected-no-diagnostics

				#define N 100

				int test_amdgcn_target_temp_alloca() {
				// CHECK-LABEL: test_amdgcn_target_temp_alloca

				int arr[N];
				ABataevUnsubmitted Not Done Reply Inline Actions `entry:` again ABataev: `entry:` again

				// CHECK: [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5)
				// CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]**
				// CHECK: store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8

				ABataevUnsubmitted Not Done Reply Inline Actions same, use regexps instead of %-like names ABataev: same, use regexps instead of %-like names
				#pragma omp target
				ABataevUnsubmitted Not Done Reply Inline Actions Just `CHECK` ABataev: Just `CHECK`
				ABataevUnsubmitted Not Done Reply Inline Actions Do not rely on the names again. Even as part of regexps ABataev: Do not rely on the names again. Even as part of regexps
				for (int i = 0; i < N; i++) {
				arr[i] = 1;
				}

				return arr[0];
				}

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 282502

clang/lib/CodeGen/CGExpr.cpp

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

clang/lib/CodeGen/CMakeLists.txt

clang/lib/CodeGen/CodeGenModule.cpp

clang/test/OpenMP/amdgcn_target_codegen.cpp

clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 282502

clang/lib/CodeGen/CGExpr.cpp

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h

clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp

clang/lib/CodeGen/CGOpenMPRuntimeGPU.h

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h

clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

clang/lib/CodeGen/CMakeLists.txt

clang/lib/CodeGen/CodeGenModule.cpp

clang/test/OpenMP/amdgcn_target_codegen.cpp

clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp

[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3
ClosedPublic