We need two notions: convergent functions -- which may expose
convergent behavior to their callers -- and convergent calls, which are
call sites at which we would like to preserve the callee's convergent
behavior, if possible.
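
As a rough sketch of the distinction (the function names below are
hypothetical, not taken from this document): a warp-level primitive such
as __shfl_sync makes the enclosing function convergent, and a call to
that function is a convergent call whose cross-thread behavior we would
like to preserve.

```cuda
// Hypothetical convergent function: the value returned by __shfl_sync
// depends on which threads of the warp execute it together, so
// broadcast_lane0 exposes convergent behavior to its callers.
__device__ int broadcast_lane0(int v) {
  return __shfl_sync(0xffffffffu, v, /*srcLane=*/0);
}

__device__ int caller(int v) {
  // A convergent call: transformations should not change the set of
  // threads that reach this call together, or the callee's result
  // could change.
  return broadcast_lane0(v);
}
```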
In CUDA device mode, all functions and calls are treated as convergent
by default. The optimizer can then strip the attribute where it can
prove it is unnecessary, for example when a function provably performs
no convergent operations.
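
A hedged sketch of how that plays out, again with made-up function
names: both functions below start out marked convergent in CUDA device
mode, but only the one that actually uses a warp-level primitive needs
to keep the attribute.

```cuda
// Contains no convergent operations, so the front end's conservative
// convergent marking can later be removed by the optimizer.
__device__ int scale(int x) {
  return 2 * x;
}

// Uses __shfl_down_sync, a convergent operation, so the convergent
// attribute must be preserved on this function and on calls to it.
__device__ int warp_sum(int x) {
  for (int offset = 16; offset > 0; offset >>= 1)
    x += __shfl_down_sync(0xffffffffu, x, offset);
  return x;
}
```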