This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Implemented __nvvm_atom_*_gen_* builtins.
ClosedPublic

Authored by tra on Jun 23 2015, 11:20 AM.

Download Raw Diff

Details

Reviewers

jholewinski
eliben
echristo

Commits

rGd21e5c668464: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.
rC240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.
rL240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.

Summary

Implemented __nvvm_atom_*_gen_* builtins.

Integer variants are implemented as atomicrmw or cmpxchg instructions.

Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.xxx LLVM intrinsic.

Diff Detail

Event Timeline

tra updated this revision to Diff 28268.Jun 23 2015, 11:20 AM

tra retitled this revision from to [CUDA] Implemented __nvvm_atom_*_gen_* builtins..

tra updated this object.

tra edited the test plan for this revision. (Show Details)

tra added reviewers: jholewinski, eliben, echristo.

tra added a subscriber: Unknown Object (MLST).

eliben added inline comments.Jun 24 2015, 8:08 AM

lib/CodeGen/CGBuiltin.cpp
112	Don't really need the Result temporary here?
118	You can just "return RValue::get(MakeBinary...)" -- similarly to how it's done below, for consistency
165	Please document all function parameters in the comment (especially ReturnBool)
174	Would universal initialization be simpler here? Value *Args[3] = {CGF.Builder.CreateBi......, CGF.EmitScalar...}
6865	I'm wondering if there's some unwritten rule saying that all target builtins should be crammed into a single file... It's closing in on 7KLOC now and no end in sight. Would it be very bad for NVPTX to have its own CGBuiltinNVPTX or something like that? Clang splits classes to multiple files (Sema, CodeGenFunction, etc) already... Eric?

Addressed Eli's comments.

lib/CodeGen/CGBuiltin.cpp
112	Done.
118	Done.
165	Done.
174	Args[1] creation is a two-step operation which gets in a way.

eliben added inline comments.Jun 24 2015, 3:46 PM

lib/CodeGen/CGBuiltin.cpp
174	You could obtain the type from Args[1] through the cast (getSrcTy) and avoid the awkward repeated assignment to Args[1]. But this isn't super important - up to you.

LGTM

I'd move the NVPTX builtin emission code to its own file unless others have strong objections, but looks good otherwise.

lib/CodeGen/CGBuiltin.cpp
192	I would just "return EmitFromInt" and "return CreateZExt" inside the if() and else, and avoid this temporary. Then the first builder call in each clause can be assigned to a more meaningful name ("Result" is very generic).

This revision is now accepted and ready to land.Jun 24 2015, 3:50 PM

jholewinski accepted this revision.Jun 25 2015, 5:37 AM

jholewinski edited edge metadata.

Removed temp variable.

tra added inline comments.Jun 25 2015, 11:27 AM

lib/CodeGen/CGBuiltin.cpp
192	I've removed temp variable altogether and added comments explaining what the code does.

Closed by commit rL240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins. (authored by tra). · Explain WhyJun 25 2015, 11:29 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

CodeGen/

CGBuiltin.cpp

213 lines

CodeGenFunction.h

1 line

test/

CodeGen/

builtins-nvptx.c

122 lines

Diff 28268

lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	if (ResultType->isPointerTy())
return CGF.Builder.CreateIntToPtr(V, ResultType);		return CGF.Builder.CreateIntToPtr(V, ResultType);

assert(V->getType() == ResultType);		assert(V->getType() == ResultType);
return V;		return V;
}		}

/// Utility to insert an atomic instruction based on Instrinsic::ID		/// Utility to insert an atomic instruction based on Instrinsic::ID
/// and the expression node.		/// and the expression node.
static RValue EmitBinaryAtomic(CodeGenFunction &CGF,		static Value *MakeBinaryAtomicValue(CodeGenFunction &CGF,
llvm::AtomicRMWInst::BinOp Kind,		llvm::AtomicRMWInst::BinOp Kind,
const CallExpr *E) {		const CallExpr *E) {
QualType T = E->getType();		QualType T = E->getType();
assert(E->getArg(0)->getType()->isPointerType());		assert(E->getArg(0)->getType()->isPointerType());
assert(CGF.getContext().hasSameUnqualifiedType(T,		assert(CGF.getContext().hasSameUnqualifiedType(T,
E->getArg(0)->getType()->getPointeeType()));		E->getArg(0)->getType()->getPointeeType()));
assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));		assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));

llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));		llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();		unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =		llvm::IntegerType *IntType =
llvm::IntegerType::get(CGF.getLLVMContext(),		llvm::IntegerType::get(CGF.getLLVMContext(),
CGF.getContext().getTypeSize(T));		CGF.getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);		llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

llvm::Value *Args[2];		llvm::Value *Args[2];
Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);		Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = CGF.EmitScalarExpr(E->getArg(1));		Args[1] = CGF.EmitScalarExpr(E->getArg(1));
llvm::Type *ValueType = Args[1]->getType();		llvm::Type *ValueType = Args[1]->getType();
Args[1] = EmitToInt(CGF, Args[1], T, IntType);		Args[1] = EmitToInt(CGF, Args[1], T, IntType);

llvm::Value *Result =		llvm::Value *Result =
CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],		CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
llvm::SequentiallyConsistent);		llvm::SequentiallyConsistent);
Result = EmitFromInt(CGF, Result, T, ValueType);		Result = EmitFromInt(CGF, Result, T, ValueType);
		return Result;
		elibenUnsubmitted Not Done Reply Inline Actions Don't really need the Result temporary here? eliben: Don't really need the Result temporary here?
		traAuthorUnsubmitted Not Done Reply Inline Actions Done. tra: Done.
		}

		static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
		llvm::AtomicRMWInst::BinOp Kind,
		const CallExpr *E) {
		llvm::Value *Result = MakeBinaryAtomicValue(CGF, Kind, E);
		elibenUnsubmitted Not Done Reply Inline Actions You can just "return RValue::get(MakeBinary...)" -- similarly to how it's done below, for consistency eliben: You can just "return RValue::get(MakeBinary...)" -- similarly to how it's done below, for…
		traAuthorUnsubmitted Not Done Reply Inline Actions Done. tra: Done.
return RValue::get(Result);		return RValue::get(Result);
}		}

/// Utility to insert an atomic instruction based Instrinsic::ID and		/// Utility to insert an atomic instruction based Instrinsic::ID and
/// the expression node, where the return value is the result of the		/// the expression node, where the return value is the result of the
/// operation.		/// operation.
static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,		static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
llvm::AtomicRMWInst::BinOp Kind,		llvm::AtomicRMWInst::BinOp Kind,
Show All 26 Lines	static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);		Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
if (Invert)		if (Invert)
Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,		Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
llvm::ConstantInt::get(IntType, -1));		llvm::ConstantInt::get(IntType, -1));
Result = EmitFromInt(CGF, Result, T, ValueType);		Result = EmitFromInt(CGF, Result, T, ValueType);
return RValue::get(Result);		return RValue::get(Result);
}		}

		/// Utility to insert an atomic cmpxchg instruction based
		/// Instrinsic::ID and the expression node, where the return value is
		/// the result of the operation.
		static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
		bool ReturnBool) {
		elibenUnsubmitted Not Done Reply Inline Actions Please document all function parameters in the comment (especially ReturnBool) eliben: Please document all function parameters in the comment (especially ReturnBool)
		traAuthorUnsubmitted Not Done Reply Inline Actions Done. tra: Done.
		QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
		llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
		unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

		llvm::IntegerType *IntType = llvm::IntegerType::get(
		CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
		llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

		Value *Args[3];
		elibenUnsubmitted Not Done Reply Inline Actions Would universal initialization be simpler here? Value *Args[3] = {CGF.Builder.CreateBi......, CGF.EmitScalar...} eliben: Would universal initialization be simpler here? Value *Args[3] = {CGF.Builder.CreateBi......
		traAuthorUnsubmitted Not Done Reply Inline Actions Args[1] creation is a two-step operation which gets in a way. tra: Args[1] creation is a two-step operation which gets in a way.
		elibenUnsubmitted Not Done Reply Inline Actions You could obtain the type from Args[1] through the cast (getSrcTy) and avoid the awkward repeated assignment to Args[1]. But this isn't super important - up to you. eliben: You could obtain the type from Args[1] through the cast (getSrcTy) and avoid the awkward…
		Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
		Args[1] = CGF.EmitScalarExpr(E->getArg(1));
		llvm::Type *ValueType = Args[1]->getType();
		Args[1] = EmitToInt(CGF, Args[1], T, IntType);
		Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);

		Value *Pair = CGF.Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
		llvm::SequentiallyConsistent,
		llvm::SequentiallyConsistent);
		Value *Result;
		if (ReturnBool) {
		Result = CGF.Builder.CreateExtractValue(Pair, 1);
		// zext bool to int.
		Result = CGF.Builder.CreateZExt(Result, CGF.ConvertType(E->getType()));
		} else {
		Result = CGF.Builder.CreateExtractValue(Pair, 0);
		Result = EmitFromInt(CGF, Result, T, ValueType);
		}
		elibenUnsubmitted Not Done Reply Inline Actions I would just "return EmitFromInt" and "return CreateZExt" inside the if() and else, and avoid this temporary. Then the first builder call in each clause can be assigned to a more meaningful name ("Result" is very generic). eliben: I would just "return EmitFromInt" and "return CreateZExt" inside the if() and else, and avoid…
		traAuthorUnsubmitted Not Done Reply Inline Actions I've removed temp variable altogether and added comments explaining what the code does. tra: I've removed temp variable altogether and added comments explaining what the code does.
		return Result;
		}

/// EmitFAbs - Emit a call to @llvm.fabs().		/// EmitFAbs - Emit a call to @llvm.fabs().
static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {		static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());		Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);		llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
Call->setDoesNotAccessMemory();		Call->setDoesNotAccessMemory();
return Call;		return Call;
}		}

▲ Show 20 Lines • Show All 890 Lines • ▼ Show 20 Lines	RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__sync_nand_and_fetch_16:		case Builtin::BI__sync_nand_and_fetch_16:
return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,		return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
llvm::Instruction::And, true);		llvm::Instruction::And, true);

case Builtin::BI__sync_val_compare_and_swap_1:		case Builtin::BI__sync_val_compare_and_swap_1:
case Builtin::BI__sync_val_compare_and_swap_2:		case Builtin::BI__sync_val_compare_and_swap_2:
case Builtin::BI__sync_val_compare_and_swap_4:		case Builtin::BI__sync_val_compare_and_swap_4:
case Builtin::BI__sync_val_compare_and_swap_8:		case Builtin::BI__sync_val_compare_and_swap_8:
case Builtin::BI__sync_val_compare_and_swap_16: {		case Builtin::BI__sync_val_compare_and_swap_16:
QualType T = E->getType();		return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =
llvm::IntegerType::get(getLLVMContext(),
getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

Value *Args[3];
Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = EmitScalarExpr(E->getArg(1));
llvm::Type *ValueType = Args[1]->getType();
Args[1] = EmitToInt(*this, Args[1], T, IntType);
Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);

Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
llvm::SequentiallyConsistent,
llvm::SequentiallyConsistent);
Result = Builder.CreateExtractValue(Result, 0);
Result = EmitFromInt(*this, Result, T, ValueType);
return RValue::get(Result);
}

case Builtin::BI__sync_bool_compare_and_swap_1:		case Builtin::BI__sync_bool_compare_and_swap_1:
case Builtin::BI__sync_bool_compare_and_swap_2:		case Builtin::BI__sync_bool_compare_and_swap_2:
case Builtin::BI__sync_bool_compare_and_swap_4:		case Builtin::BI__sync_bool_compare_and_swap_4:
case Builtin::BI__sync_bool_compare_and_swap_8:		case Builtin::BI__sync_bool_compare_and_swap_8:
case Builtin::BI__sync_bool_compare_and_swap_16: {		case Builtin::BI__sync_bool_compare_and_swap_16:
QualType T = E->getArg(1)->getType();		return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =
llvm::IntegerType::get(getLLVMContext(),
getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

Value *Args[3];
Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = EmitToInt(*this, EmitScalarExpr(E->getArg(1)), T, IntType);
Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);

Value *Pair = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
llvm::SequentiallyConsistent,
llvm::SequentiallyConsistent);
Value *Result = Builder.CreateExtractValue(Pair, 1);
// zext bool to int.
Result = Builder.CreateZExt(Result, ConvertType(E->getType()));
return RValue::get(Result);
}

case Builtin::BI__sync_swap_1:		case Builtin::BI__sync_swap_1:
case Builtin::BI__sync_swap_2:		case Builtin::BI__sync_swap_2:
case Builtin::BI__sync_swap_4:		case Builtin::BI__sync_swap_4:
case Builtin::BI__sync_swap_8:		case Builtin::BI__sync_swap_8:
case Builtin::BI__sync_swap_16:		case Builtin::BI__sync_swap_16:
return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);		return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

▲ Show 20 Lines • Show All 755 Lines • ▼ Show 20 Lines	Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
case llvm::Triple::ppc64:		case llvm::Triple::ppc64:
case llvm::Triple::ppc64le:		case llvm::Triple::ppc64le:
return EmitPPCBuiltinExpr(BuiltinID, E);		return EmitPPCBuiltinExpr(BuiltinID, E);
case llvm::Triple::r600:		case llvm::Triple::r600:
case llvm::Triple::amdgcn:		case llvm::Triple::amdgcn:
return EmitAMDGPUBuiltinExpr(BuiltinID, E);		return EmitAMDGPUBuiltinExpr(BuiltinID, E);
case llvm::Triple::systemz:		case llvm::Triple::systemz:
return EmitSystemZBuiltinExpr(BuiltinID, E);		return EmitSystemZBuiltinExpr(BuiltinID, E);
		case llvm::Triple::nvptx:
		case llvm::Triple::nvptx64:
		return EmitNVPTXBuiltinExpr(BuiltinID, E);
default:		default:
return nullptr;		return nullptr;
}		}
}		}

static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,		static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
NeonTypeFlags TypeFlags,		NeonTypeFlags TypeFlags,
bool V1Ty=false) {		bool V1Ty=false) {
▲ Show 20 Lines • Show All 4,963 Lines • ▼ Show 20 Lines	#define INTRINSIC_WITH_CC(NAME) \
INTRINSIC_WITH_CC(s390_vftcidb);		INTRINSIC_WITH_CC(s390_vftcidb);

#undef INTRINSIC_WITH_CC		#undef INTRINSIC_WITH_CC

default:		default:
return nullptr;		return nullptr;
}		}
}		}

		Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
		elibenUnsubmitted Not Done Reply Inline Actions I'm wondering if there's some unwritten rule saying that all target builtins should be crammed into a single file... It's closing in on 7KLOC now and no end in sight. Would it be very bad for NVPTX to have its own CGBuiltinNVPTX or something like that? Clang splits classes to multiple files (Sema, CodeGenFunction, etc) already... Eric? eliben: I'm wondering if there's some unwritten rule saying that all target builtins should be crammed…
		const CallExpr *E) {
		switch (BuiltinID) {
		case NVPTX::BI__nvvm_atom_add_gen_i:
		case NVPTX::BI__nvvm_atom_add_gen_l:
		case NVPTX::BI__nvvm_atom_add_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);

		case NVPTX::BI__nvvm_atom_sub_gen_i:
		case NVPTX::BI__nvvm_atom_sub_gen_l:
		case NVPTX::BI__nvvm_atom_sub_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);

		case NVPTX::BI__nvvm_atom_and_gen_i:
		case NVPTX::BI__nvvm_atom_and_gen_l:
		case NVPTX::BI__nvvm_atom_and_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);

		case NVPTX::BI__nvvm_atom_or_gen_i:
		case NVPTX::BI__nvvm_atom_or_gen_l:
		case NVPTX::BI__nvvm_atom_or_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);

		case NVPTX::BI__nvvm_atom_xor_gen_i:
		case NVPTX::BI__nvvm_atom_xor_gen_l:
		case NVPTX::BI__nvvm_atom_xor_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);

		case NVPTX::BI__nvvm_atom_xchg_gen_i:
		case NVPTX::BI__nvvm_atom_xchg_gen_l:
		case NVPTX::BI__nvvm_atom_xchg_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);

		case NVPTX::BI__nvvm_atom_max_gen_i:
		case NVPTX::BI__nvvm_atom_max_gen_l:
		case NVPTX::BI__nvvm_atom_max_gen_ll:
		case NVPTX::BI__nvvm_atom_max_gen_ui:
		case NVPTX::BI__nvvm_atom_max_gen_ul:
		case NVPTX::BI__nvvm_atom_max_gen_ull:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);

		case NVPTX::BI__nvvm_atom_min_gen_i:
		case NVPTX::BI__nvvm_atom_min_gen_l:
		case NVPTX::BI__nvvm_atom_min_gen_ll:
		case NVPTX::BI__nvvm_atom_min_gen_ui:
		case NVPTX::BI__nvvm_atom_min_gen_ul:
		case NVPTX::BI__nvvm_atom_min_gen_ull:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);

		case NVPTX::BI__nvvm_atom_cas_gen_i:
		case NVPTX::BI__nvvm_atom_cas_gen_l:
		case NVPTX::BI__nvvm_atom_cas_gen_ll:
		return MakeAtomicCmpXchgValue(*this, E, true);

		case NVPTX::BI__nvvm_atom_add_gen_f: {
		Value *Ptr = EmitScalarExpr(E->getArg(0));
		Value *Val = EmitScalarExpr(E->getArg(1));
		// atomicrmw only deals with integer arguments so we need to use
		// LLVM's nvvm_atomic_load_add_f32 intrinsic for that.
		Value *FnALAF32 =
		CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f32, Ptr->getType());
		return Builder.CreateCall(FnALAF32, {Ptr, Val});
		}

		default:
		return nullptr;
		}
		}

lib/CodeGen/CodeGenFunction.h

Show First 20 Lines • Show All 2,580 Lines • ▼ Show 20 Lines	public:
llvm::Value *vectorWrapScalar16(llvm::Value *Op);		llvm::Value *vectorWrapScalar16(llvm::Value *Op);
llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E);		llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E);

llvm::Value *BuildVector(ArrayRef<llvm::Value*> Ops);		llvm::Value *BuildVector(ArrayRef<llvm::Value*> Ops);
llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E);		llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E);		llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);		llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E);		llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
		llvm::Value *EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E);

llvm::Value *EmitObjCProtocolExpr(const ObjCProtocolExpr *E);		llvm::Value *EmitObjCProtocolExpr(const ObjCProtocolExpr *E);
llvm::Value *EmitObjCStringLiteral(const ObjCStringLiteral *E);		llvm::Value *EmitObjCStringLiteral(const ObjCStringLiteral *E);
llvm::Value *EmitObjCBoxedExpr(const ObjCBoxedExpr *E);		llvm::Value *EmitObjCBoxedExpr(const ObjCBoxedExpr *E);
llvm::Value *EmitObjCArrayLiteral(const ObjCArrayLiteral *E);		llvm::Value *EmitObjCArrayLiteral(const ObjCArrayLiteral *E);
llvm::Value *EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral *E);		llvm::Value *EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral *E);
llvm::Value *EmitObjCCollectionLiteral(const Expr *E,		llvm::Value *EmitObjCCollectionLiteral(const Expr *E,
const ObjCMethodDecl *MethodWithObjects);		const ObjCMethodDecl *MethodWithObjects);
▲ Show 20 Lines • Show All 454 Lines • Show Last 20 Lines

test/CodeGen/builtins-nvptx.c

// REQUIRES: nvptx-registered-target		// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s		// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s		// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s

int read_tid() {		#define __device__ __attribute__((device))
		#define __global__ __attribute__((global))
		#define __shared__ __attribute__((shared))
		#define __constant__ __attribute__((constant))

		__device__ int read_tid() {

// CHECK: call i32 @llvm.ptx.read.tid.x()		// CHECK: call i32 @llvm.ptx.read.tid.x()
// CHECK: call i32 @llvm.ptx.read.tid.y()		// CHECK: call i32 @llvm.ptx.read.tid.y()
// CHECK: call i32 @llvm.ptx.read.tid.z()		// CHECK: call i32 @llvm.ptx.read.tid.z()
// CHECK: call i32 @llvm.ptx.read.tid.w()		// CHECK: call i32 @llvm.ptx.read.tid.w()

int x = __builtin_ptx_read_tid_x();		int x = __builtin_ptx_read_tid_x();
int y = __builtin_ptx_read_tid_y();		int y = __builtin_ptx_read_tid_y();
int z = __builtin_ptx_read_tid_z();		int z = __builtin_ptx_read_tid_z();
int w = __builtin_ptx_read_tid_w();		int w = __builtin_ptx_read_tid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ntid() {		__device__ int read_ntid() {

// CHECK: call i32 @llvm.ptx.read.ntid.x()		// CHECK: call i32 @llvm.ptx.read.ntid.x()
// CHECK: call i32 @llvm.ptx.read.ntid.y()		// CHECK: call i32 @llvm.ptx.read.ntid.y()
// CHECK: call i32 @llvm.ptx.read.ntid.z()		// CHECK: call i32 @llvm.ptx.read.ntid.z()
// CHECK: call i32 @llvm.ptx.read.ntid.w()		// CHECK: call i32 @llvm.ptx.read.ntid.w()

int x = __builtin_ptx_read_ntid_x();		int x = __builtin_ptx_read_ntid_x();
int y = __builtin_ptx_read_ntid_y();		int y = __builtin_ptx_read_ntid_y();
int z = __builtin_ptx_read_ntid_z();		int z = __builtin_ptx_read_ntid_z();
int w = __builtin_ptx_read_ntid_w();		int w = __builtin_ptx_read_ntid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ctaid() {		__device__ int read_ctaid() {

// CHECK: call i32 @llvm.ptx.read.ctaid.x()		// CHECK: call i32 @llvm.ptx.read.ctaid.x()
// CHECK: call i32 @llvm.ptx.read.ctaid.y()		// CHECK: call i32 @llvm.ptx.read.ctaid.y()
// CHECK: call i32 @llvm.ptx.read.ctaid.z()		// CHECK: call i32 @llvm.ptx.read.ctaid.z()
// CHECK: call i32 @llvm.ptx.read.ctaid.w()		// CHECK: call i32 @llvm.ptx.read.ctaid.w()

int x = __builtin_ptx_read_ctaid_x();		int x = __builtin_ptx_read_ctaid_x();
int y = __builtin_ptx_read_ctaid_y();		int y = __builtin_ptx_read_ctaid_y();
int z = __builtin_ptx_read_ctaid_z();		int z = __builtin_ptx_read_ctaid_z();
int w = __builtin_ptx_read_ctaid_w();		int w = __builtin_ptx_read_ctaid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_nctaid() {		__device__ int read_nctaid() {

// CHECK: call i32 @llvm.ptx.read.nctaid.x()		// CHECK: call i32 @llvm.ptx.read.nctaid.x()
// CHECK: call i32 @llvm.ptx.read.nctaid.y()		// CHECK: call i32 @llvm.ptx.read.nctaid.y()
// CHECK: call i32 @llvm.ptx.read.nctaid.z()		// CHECK: call i32 @llvm.ptx.read.nctaid.z()
// CHECK: call i32 @llvm.ptx.read.nctaid.w()		// CHECK: call i32 @llvm.ptx.read.nctaid.w()

int x = __builtin_ptx_read_nctaid_x();		int x = __builtin_ptx_read_nctaid_x();
int y = __builtin_ptx_read_nctaid_y();		int y = __builtin_ptx_read_nctaid_y();
int z = __builtin_ptx_read_nctaid_z();		int z = __builtin_ptx_read_nctaid_z();
int w = __builtin_ptx_read_nctaid_w();		int w = __builtin_ptx_read_nctaid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ids() {		__device__ int read_ids() {

// CHECK: call i32 @llvm.ptx.read.laneid()		// CHECK: call i32 @llvm.ptx.read.laneid()
// CHECK: call i32 @llvm.ptx.read.warpid()		// CHECK: call i32 @llvm.ptx.read.warpid()
// CHECK: call i32 @llvm.ptx.read.nwarpid()		// CHECK: call i32 @llvm.ptx.read.nwarpid()
// CHECK: call i32 @llvm.ptx.read.smid()		// CHECK: call i32 @llvm.ptx.read.smid()
// CHECK: call i32 @llvm.ptx.read.nsmid()		// CHECK: call i32 @llvm.ptx.read.nsmid()
// CHECK: call i32 @llvm.ptx.read.gridid()		// CHECK: call i32 @llvm.ptx.read.gridid()

int a = __builtin_ptx_read_laneid();		int a = __builtin_ptx_read_laneid();
int b = __builtin_ptx_read_warpid();		int b = __builtin_ptx_read_warpid();
int c = __builtin_ptx_read_nwarpid();		int c = __builtin_ptx_read_nwarpid();
int d = __builtin_ptx_read_smid();		int d = __builtin_ptx_read_smid();
int e = __builtin_ptx_read_nsmid();		int e = __builtin_ptx_read_nsmid();
int f = __builtin_ptx_read_gridid();		int f = __builtin_ptx_read_gridid();

return a + b + c + d + e + f;		return a + b + c + d + e + f;

}		}

int read_lanemasks() {		__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.ptx.read.lanemask.eq()		// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
// CHECK: call i32 @llvm.ptx.read.lanemask.le()		// CHECK: call i32 @llvm.ptx.read.lanemask.le()
// CHECK: call i32 @llvm.ptx.read.lanemask.lt()		// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
// CHECK: call i32 @llvm.ptx.read.lanemask.ge()		// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
// CHECK: call i32 @llvm.ptx.read.lanemask.gt()		// CHECK: call i32 @llvm.ptx.read.lanemask.gt()

int a = __builtin_ptx_read_lanemask_eq();		int a = __builtin_ptx_read_lanemask_eq();
int b = __builtin_ptx_read_lanemask_le();		int b = __builtin_ptx_read_lanemask_le();
int c = __builtin_ptx_read_lanemask_lt();		int c = __builtin_ptx_read_lanemask_lt();
int d = __builtin_ptx_read_lanemask_ge();		int d = __builtin_ptx_read_lanemask_ge();
int e = __builtin_ptx_read_lanemask_gt();		int e = __builtin_ptx_read_lanemask_gt();

return a + b + c + d + e;		return a + b + c + d + e;

}		}

		__device__ long read_clocks() {
long read_clocks() {

// CHECK: call i32 @llvm.ptx.read.clock()		// CHECK: call i32 @llvm.ptx.read.clock()
// CHECK: call i64 @llvm.ptx.read.clock64()		// CHECK: call i64 @llvm.ptx.read.clock64()

int a = __builtin_ptx_read_clock();		int a = __builtin_ptx_read_clock();
long b = __builtin_ptx_read_clock64();		long b = __builtin_ptx_read_clock64();

return (long)a + b;		return (long)a + b;

}		}

int read_pms() {		__device__ int read_pms() {

// CHECK: call i32 @llvm.ptx.read.pm0()		// CHECK: call i32 @llvm.ptx.read.pm0()
// CHECK: call i32 @llvm.ptx.read.pm1()		// CHECK: call i32 @llvm.ptx.read.pm1()
// CHECK: call i32 @llvm.ptx.read.pm2()		// CHECK: call i32 @llvm.ptx.read.pm2()
// CHECK: call i32 @llvm.ptx.read.pm3()		// CHECK: call i32 @llvm.ptx.read.pm3()

int a = __builtin_ptx_read_pm0();		int a = __builtin_ptx_read_pm0();
int b = __builtin_ptx_read_pm1();		int b = __builtin_ptx_read_pm1();
int c = __builtin_ptx_read_pm2();		int c = __builtin_ptx_read_pm2();
int d = __builtin_ptx_read_pm3();		int d = __builtin_ptx_read_pm3();

return a + b + c + d;		return a + b + c + d;

}		}

void sync() {		__device__ void sync() {

// CHECK: call void @llvm.ptx.bar.sync(i32 0)		// CHECK: call void @llvm.ptx.bar.sync(i32 0)

__builtin_ptx_bar_sync(0);		__builtin_ptx_bar_sync(0);

}		}


// NVVM intrinsics		// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the		// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def		// builtins defined in BuiltinsNVPTX.def
void nvvm_math(float f1, float f2, double d1, double d2) {		__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f		// CHECK: call float @llvm.nvvm.fmax.f
float t1 = __nvvm_fmax_f(f1, f2);		float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f		// CHECK: call float @llvm.nvvm.fmin.f
float t2 = __nvvm_fmin_f(f1, f2);		float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f		// CHECK: call float @llvm.nvvm.sqrt.rn.f
float t3 = __nvvm_sqrt_rn_f(f1);		float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f		// CHECK: call float @llvm.nvvm.rcp.rn.f
float t4 = __nvvm_rcp_rn_f(f2);		float t4 = __nvvm_rcp_rn_f(f2);
Show All 13 Lines	// CHECK: call void @llvm.nvvm.membar.cta()
__nvvm_membar_cta();		__nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()		// CHECK: call void @llvm.nvvm.membar.gl()
__nvvm_membar_gl();		__nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()		// CHECK: call void @llvm.nvvm.membar.sys()
__nvvm_membar_sys();		__nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()		// CHECK: call void @llvm.nvvm.barrier0()
__nvvm_bar0();		__nvvm_bar0();
}		}

		__device__ int di;
		__shared__ int si;
		__device__ long dl;
		__shared__ long sl;
		__device__ long long dll;
		__shared__ long long sll;

		// Check for atomic intrinsics
		// CHECK-LABEL: nvvm_atom
		__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
		long long *llp, long long ll) {
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_i(ip, i);
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_l(&dl, l);
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_ll(&sll, ll);

		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_i(ip, i);
		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_l(&dl, l);
		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_ll(&sll, ll);

		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_i(ip, i);
		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_l(&dl, l);
		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_ll(&sll, ll);

		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_i(ip, i);
		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_l(&dl, l);
		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_ll(&sll, ll);

		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_i(ip, i);
		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_l(&dl, l);
		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_ll(&sll, ll);

		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_i(ip, i);
		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_l(&dl, l);
		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_ll(&sll, ll);

		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_i(ip, i);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ui((unsigned int *)ip, i);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_l(&dl, l);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ll(&sll, ll);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_i(ip, i);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ui((unsigned int *)ip, i);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_l(&dl, l);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ll(&sll, ll);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_i(ip, 0, i);
		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_l(&dl, 0, l);
		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_ll(&sll, 0, ll);

		// CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
		__nvvm_atom_add_gen_f(fp, f);

		// CHECK: ret
		}