This is an archive of the discontinued LLVM Phabricator instance.

[CUDA] Implemented __nvvm_atom_*_gen_* builtins.
ClosedPublic

Authored by tra on Jun 23 2015, 11:20 AM.

Download Raw Diff

Details

Reviewers

jholewinski
eliben
echristo

Commits

rGd21e5c668464: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.
rC240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.
rL240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins.

Summary

Implemented __nvvm_atom_*_gen_* builtins.

Integer variants are implemented as atomicrmw or cmpxchg instructions.

Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.xxx LLVM intrinsic.

Diff Detail

Repository: rL LLVM

Event Timeline

tra updated this revision to Diff 28268.Jun 23 2015, 11:20 AM

tra retitled this revision from to [CUDA] Implemented __nvvm_atom_*_gen_* builtins..

tra updated this object.

tra edited the test plan for this revision. (Show Details)

tra added reviewers: jholewinski, eliben, echristo.

tra added a subscriber: Unknown Object (MLST).

eliben added inline comments.Jun 24 2015, 8:08 AM

lib/CodeGen/CGBuiltin.cpp
112 ↗	(On Diff #28268)	Don't really need the Result temporary here?
118 ↗	(On Diff #28268)	You can just "return RValue::get(MakeBinary...)" -- similarly to how it's done below, for consistency
165 ↗	(On Diff #28268)	Please document all function parameters in the comment (especially ReturnBool)
174 ↗	(On Diff #28268)	Would universal initialization be simpler here? Value *Args[3] = {CGF.Builder.CreateBi......, CGF.EmitScalar...}
6865 ↗	(On Diff #28268)	I'm wondering if there's some unwritten rule saying that all target builtins should be crammed into a single file... It's closing in on 7KLOC now and no end in sight. Would it be very bad for NVPTX to have its own CGBuiltinNVPTX or something like that? Clang splits classes to multiple files (Sema, CodeGenFunction, etc) already... Eric?

Addressed Eli's comments.

lib/CodeGen/CGBuiltin.cpp
112 ↗	(On Diff #28268)	Done.
118 ↗	(On Diff #28268)	Done.
165 ↗	(On Diff #28268)	Done.
174 ↗	(On Diff #28268)	Args[1] creation is a two-step operation which gets in a way.

eliben added inline comments.Jun 24 2015, 3:46 PM

lib/CodeGen/CGBuiltin.cpp
174 ↗	(On Diff #28268)	You could obtain the type from Args[1] through the cast (getSrcTy) and avoid the awkward repeated assignment to Args[1]. But this isn't super important - up to you.

LGTM

I'd move the NVPTX builtin emission code to its own file unless others have strong objections, but looks good otherwise.

lib/CodeGen/CGBuiltin.cpp
190 ↗	(On Diff #28371)	I would just "return EmitFromInt" and "return CreateZExt" inside the if() and else, and avoid this temporary. Then the first builder call in each clause can be assigned to a more meaningful name ("Result" is very generic).

This revision is now accepted and ready to land.Jun 24 2015, 3:50 PM

jholewinski accepted this revision.Jun 25 2015, 5:37 AM

jholewinski edited edge metadata.

Removed temp variable.

tra added inline comments.Jun 25 2015, 11:27 AM

lib/CodeGen/CGBuiltin.cpp
190 ↗	(On Diff #28371)	I've removed temp variable altogether and added comments explaining what the code does.

Closed by commit rL240669: [CUDA] Implemented __nvvm_atom_*_gen_* builtins. (authored by tra). · Explain Why · Jun 25 2015, 11:29 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

cfe/

trunk/

lib/

CodeGen/

CGBuiltin.cpp

179 lines

CodeGenFunction.h

1 line

test/

CodeGen/

builtins-nvptx.c

122 lines

Diff 28483

cfe/trunk/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	if (ResultType->isPointerTy())
return CGF.Builder.CreateIntToPtr(V, ResultType);		return CGF.Builder.CreateIntToPtr(V, ResultType);

assert(V->getType() == ResultType);		assert(V->getType() == ResultType);
return V;		return V;
}		}

/// Utility to insert an atomic instruction based on Instrinsic::ID		/// Utility to insert an atomic instruction based on Instrinsic::ID
/// and the expression node.		/// and the expression node.
static RValue EmitBinaryAtomic(CodeGenFunction &CGF,		static Value *MakeBinaryAtomicValue(CodeGenFunction &CGF,
llvm::AtomicRMWInst::BinOp Kind,		llvm::AtomicRMWInst::BinOp Kind,
const CallExpr *E) {		const CallExpr *E) {
QualType T = E->getType();		QualType T = E->getType();
assert(E->getArg(0)->getType()->isPointerType());		assert(E->getArg(0)->getType()->isPointerType());
assert(CGF.getContext().hasSameUnqualifiedType(T,		assert(CGF.getContext().hasSameUnqualifiedType(T,
E->getArg(0)->getType()->getPointeeType()));		E->getArg(0)->getType()->getPointeeType()));
assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));		assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));

llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));		llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();		unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =		llvm::IntegerType *IntType =
llvm::IntegerType::get(CGF.getLLVMContext(),		llvm::IntegerType::get(CGF.getLLVMContext(),
CGF.getContext().getTypeSize(T));		CGF.getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);		llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

llvm::Value *Args[2];		llvm::Value *Args[2];
Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);		Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = CGF.EmitScalarExpr(E->getArg(1));		Args[1] = CGF.EmitScalarExpr(E->getArg(1));
llvm::Type *ValueType = Args[1]->getType();		llvm::Type *ValueType = Args[1]->getType();
Args[1] = EmitToInt(CGF, Args[1], T, IntType);		Args[1] = EmitToInt(CGF, Args[1], T, IntType);

llvm::Value *Result =		llvm::Value *Result =
CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],		CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
llvm::SequentiallyConsistent);		llvm::SequentiallyConsistent);
Result = EmitFromInt(CGF, Result, T, ValueType);		return EmitFromInt(CGF, Result, T, ValueType);
return RValue::get(Result);		}

		static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
		llvm::AtomicRMWInst::BinOp Kind,
		const CallExpr *E) {
		return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
}		}

/// Utility to insert an atomic instruction based Instrinsic::ID and		/// Utility to insert an atomic instruction based Instrinsic::ID and
/// the expression node, where the return value is the result of the		/// the expression node, where the return value is the result of the
/// operation.		/// operation.
static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,		static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
llvm::AtomicRMWInst::BinOp Kind,		llvm::AtomicRMWInst::BinOp Kind,
const CallExpr *E,		const CallExpr *E,
Show All 25 Lines	static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);		Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]);
if (Invert)		if (Invert)
Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,		Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
llvm::ConstantInt::get(IntType, -1));		llvm::ConstantInt::get(IntType, -1));
Result = EmitFromInt(CGF, Result, T, ValueType);		Result = EmitFromInt(CGF, Result, T, ValueType);
return RValue::get(Result);		return RValue::get(Result);
}		}

		/// @brief Utility to insert an atomic cmpxchg instruction.
		///
		/// @param CGF The current codegen function.
		/// @param E Builtin call expression to convert to cmpxchg.
		/// arg0 - address to operate on
		/// arg1 - value to compare with
		/// arg2 - new value
		/// @param ReturnBool Specifies whether to return success flag of
		/// cmpxchg result or the old value.
		///
		/// @returns result of cmpxchg, according to ReturnBool
		static Value MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr E,
		bool ReturnBool) {
		QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
		llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
		unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

		llvm::IntegerType *IntType = llvm::IntegerType::get(
		CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
		llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

		Value *Args[3];
		Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
		Args[1] = CGF.EmitScalarExpr(E->getArg(1));
		llvm::Type *ValueType = Args[1]->getType();
		Args[1] = EmitToInt(CGF, Args[1], T, IntType);
		Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);

		Value *Pair = CGF.Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
		llvm::SequentiallyConsistent,
		llvm::SequentiallyConsistent);
		if (ReturnBool)
		// Extract boolean success flag and zext it to int.
		return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
		CGF.ConvertType(E->getType()));
		else
		// Extract old value and emit it using the same type as compare value.
		return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
		ValueType);
		}

/// EmitFAbs - Emit a call to @llvm.fabs().		/// EmitFAbs - Emit a call to @llvm.fabs().
static Value EmitFAbs(CodeGenFunction &CGF, Value V) {		static Value EmitFAbs(CodeGenFunction &CGF, Value V) {
Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());		Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);		llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
Call->setDoesNotAccessMemory();		Call->setDoesNotAccessMemory();
return Call;		return Call;
}		}

▲ Show 20 Lines • Show All 890 Lines • ▼ Show 20 Lines	RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__sync_nand_and_fetch_16:		case Builtin::BI__sync_nand_and_fetch_16:
return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,		return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
llvm::Instruction::And, true);		llvm::Instruction::And, true);

case Builtin::BI__sync_val_compare_and_swap_1:		case Builtin::BI__sync_val_compare_and_swap_1:
case Builtin::BI__sync_val_compare_and_swap_2:		case Builtin::BI__sync_val_compare_and_swap_2:
case Builtin::BI__sync_val_compare_and_swap_4:		case Builtin::BI__sync_val_compare_and_swap_4:
case Builtin::BI__sync_val_compare_and_swap_8:		case Builtin::BI__sync_val_compare_and_swap_8:
case Builtin::BI__sync_val_compare_and_swap_16: {		case Builtin::BI__sync_val_compare_and_swap_16:
QualType T = E->getType();		return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =
llvm::IntegerType::get(getLLVMContext(),
getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

Value *Args[3];
Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = EmitScalarExpr(E->getArg(1));
llvm::Type *ValueType = Args[1]->getType();
Args[1] = EmitToInt(*this, Args[1], T, IntType);
Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);

Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
llvm::SequentiallyConsistent,
llvm::SequentiallyConsistent);
Result = Builder.CreateExtractValue(Result, 0);
Result = EmitFromInt(*this, Result, T, ValueType);
return RValue::get(Result);
}

case Builtin::BI__sync_bool_compare_and_swap_1:		case Builtin::BI__sync_bool_compare_and_swap_1:
case Builtin::BI__sync_bool_compare_and_swap_2:		case Builtin::BI__sync_bool_compare_and_swap_2:
case Builtin::BI__sync_bool_compare_and_swap_4:		case Builtin::BI__sync_bool_compare_and_swap_4:
case Builtin::BI__sync_bool_compare_and_swap_8:		case Builtin::BI__sync_bool_compare_and_swap_8:
case Builtin::BI__sync_bool_compare_and_swap_16: {		case Builtin::BI__sync_bool_compare_and_swap_16:
QualType T = E->getArg(1)->getType();		return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();

llvm::IntegerType *IntType =
llvm::IntegerType::get(getLLVMContext(),
getContext().getTypeSize(T));
llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);

Value *Args[3];
Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
Args[1] = EmitToInt(*this, EmitScalarExpr(E->getArg(1)), T, IntType);
Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);

Value *Pair = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
llvm::SequentiallyConsistent,
llvm::SequentiallyConsistent);
Value *Result = Builder.CreateExtractValue(Pair, 1);
// zext bool to int.
Result = Builder.CreateZExt(Result, ConvertType(E->getType()));
return RValue::get(Result);
}

case Builtin::BI__sync_swap_1:		case Builtin::BI__sync_swap_1:
case Builtin::BI__sync_swap_2:		case Builtin::BI__sync_swap_2:
case Builtin::BI__sync_swap_4:		case Builtin::BI__sync_swap_4:
case Builtin::BI__sync_swap_8:		case Builtin::BI__sync_swap_8:
case Builtin::BI__sync_swap_16:		case Builtin::BI__sync_swap_16:
return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);		return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);

▲ Show 20 Lines • Show All 755 Lines • ▼ Show 20 Lines	Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
case llvm::Triple::ppc64:		case llvm::Triple::ppc64:
case llvm::Triple::ppc64le:		case llvm::Triple::ppc64le:
return EmitPPCBuiltinExpr(BuiltinID, E);		return EmitPPCBuiltinExpr(BuiltinID, E);
case llvm::Triple::r600:		case llvm::Triple::r600:
case llvm::Triple::amdgcn:		case llvm::Triple::amdgcn:
return EmitAMDGPUBuiltinExpr(BuiltinID, E);		return EmitAMDGPUBuiltinExpr(BuiltinID, E);
case llvm::Triple::systemz:		case llvm::Triple::systemz:
return EmitSystemZBuiltinExpr(BuiltinID, E);		return EmitSystemZBuiltinExpr(BuiltinID, E);
		case llvm::Triple::nvptx:
		case llvm::Triple::nvptx64:
		return EmitNVPTXBuiltinExpr(BuiltinID, E);
default:		default:
return nullptr;		return nullptr;
}		}
}		}

static llvm::VectorType GetNeonType(CodeGenFunction CGF,		static llvm::VectorType GetNeonType(CodeGenFunction CGF,
NeonTypeFlags TypeFlags,		NeonTypeFlags TypeFlags,
bool V1Ty=false) {		bool V1Ty=false) {
▲ Show 20 Lines • Show All 5,005 Lines • ▼ Show 20 Lines	#define INTRINSIC_WITH_CC(NAME) \
INTRINSIC_WITH_CC(s390_vftcidb);		INTRINSIC_WITH_CC(s390_vftcidb);

#undef INTRINSIC_WITH_CC		#undef INTRINSIC_WITH_CC

default:		default:
return nullptr;		return nullptr;
}		}
}		}

		Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
		const CallExpr *E) {
		switch (BuiltinID) {
		case NVPTX::BI__nvvm_atom_add_gen_i:
		case NVPTX::BI__nvvm_atom_add_gen_l:
		case NVPTX::BI__nvvm_atom_add_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);

		case NVPTX::BI__nvvm_atom_sub_gen_i:
		case NVPTX::BI__nvvm_atom_sub_gen_l:
		case NVPTX::BI__nvvm_atom_sub_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);

		case NVPTX::BI__nvvm_atom_and_gen_i:
		case NVPTX::BI__nvvm_atom_and_gen_l:
		case NVPTX::BI__nvvm_atom_and_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);

		case NVPTX::BI__nvvm_atom_or_gen_i:
		case NVPTX::BI__nvvm_atom_or_gen_l:
		case NVPTX::BI__nvvm_atom_or_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);

		case NVPTX::BI__nvvm_atom_xor_gen_i:
		case NVPTX::BI__nvvm_atom_xor_gen_l:
		case NVPTX::BI__nvvm_atom_xor_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);

		case NVPTX::BI__nvvm_atom_xchg_gen_i:
		case NVPTX::BI__nvvm_atom_xchg_gen_l:
		case NVPTX::BI__nvvm_atom_xchg_gen_ll:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);

		case NVPTX::BI__nvvm_atom_max_gen_i:
		case NVPTX::BI__nvvm_atom_max_gen_l:
		case NVPTX::BI__nvvm_atom_max_gen_ll:
		case NVPTX::BI__nvvm_atom_max_gen_ui:
		case NVPTX::BI__nvvm_atom_max_gen_ul:
		case NVPTX::BI__nvvm_atom_max_gen_ull:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);

		case NVPTX::BI__nvvm_atom_min_gen_i:
		case NVPTX::BI__nvvm_atom_min_gen_l:
		case NVPTX::BI__nvvm_atom_min_gen_ll:
		case NVPTX::BI__nvvm_atom_min_gen_ui:
		case NVPTX::BI__nvvm_atom_min_gen_ul:
		case NVPTX::BI__nvvm_atom_min_gen_ull:
		return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);

		case NVPTX::BI__nvvm_atom_cas_gen_i:
		case NVPTX::BI__nvvm_atom_cas_gen_l:
		case NVPTX::BI__nvvm_atom_cas_gen_ll:
		return MakeAtomicCmpXchgValue(*this, E, true);

		case NVPTX::BI__nvvm_atom_add_gen_f: {
		Value *Ptr = EmitScalarExpr(E->getArg(0));
		Value *Val = EmitScalarExpr(E->getArg(1));
		// atomicrmw only deals with integer arguments so we need to use
		// LLVM's nvvm_atomic_load_add_f32 intrinsic for that.
		Value *FnALAF32 =
		CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f32, Ptr->getType());
		return Builder.CreateCall(FnALAF32, {Ptr, Val});
		}

		default:
		return nullptr;
		}
		}

cfe/trunk/lib/CodeGen/CodeGenFunction.h

Show First 20 Lines • Show All 2,594 Lines • ▼ Show 20 Lines	public:
llvm::Value vectorWrapScalar16(llvm::Value Op);		llvm::Value vectorWrapScalar16(llvm::Value Op);
llvm::Value EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr E);		llvm::Value EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr E);

llvm::Value BuildVector(ArrayRef<llvm::Value> Ops);		llvm::Value BuildVector(ArrayRef<llvm::Value> Ops);
llvm::Value EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr E);		llvm::Value EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr E);
llvm::Value EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr E);		llvm::Value EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr E);
llvm::Value EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr E);		llvm::Value EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr E);
llvm::Value EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr E);		llvm::Value EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr E);
		llvm::Value EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr E);

llvm::Value EmitObjCProtocolExpr(const ObjCProtocolExpr E);		llvm::Value EmitObjCProtocolExpr(const ObjCProtocolExpr E);
llvm::Value EmitObjCStringLiteral(const ObjCStringLiteral E);		llvm::Value EmitObjCStringLiteral(const ObjCStringLiteral E);
llvm::Value EmitObjCBoxedExpr(const ObjCBoxedExpr E);		llvm::Value EmitObjCBoxedExpr(const ObjCBoxedExpr E);
llvm::Value EmitObjCArrayLiteral(const ObjCArrayLiteral E);		llvm::Value EmitObjCArrayLiteral(const ObjCArrayLiteral E);
llvm::Value EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral E);		llvm::Value EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral E);
llvm::Value EmitObjCCollectionLiteral(const Expr E,		llvm::Value EmitObjCCollectionLiteral(const Expr E,
const ObjCMethodDecl *MethodWithObjects);		const ObjCMethodDecl *MethodWithObjects);
▲ Show 20 Lines • Show All 454 Lines • Show Last 20 Lines

cfe/trunk/test/CodeGen/builtins-nvptx.c

// REQUIRES: nvptx-registered-target		// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -S -emit-llvm -o - %s \| FileCheck %s		// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s \| FileCheck %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -S -emit-llvm -o - %s \| FileCheck %s		// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s \| FileCheck %s

int read_tid() {		#define __device__ __attribute__((device))
		#define __global__ __attribute__((global))
		#define __shared__ __attribute__((shared))
		#define __constant__ __attribute__((constant))

		__device__ int read_tid() {

// CHECK: call i32 @llvm.ptx.read.tid.x()		// CHECK: call i32 @llvm.ptx.read.tid.x()
// CHECK: call i32 @llvm.ptx.read.tid.y()		// CHECK: call i32 @llvm.ptx.read.tid.y()
// CHECK: call i32 @llvm.ptx.read.tid.z()		// CHECK: call i32 @llvm.ptx.read.tid.z()
// CHECK: call i32 @llvm.ptx.read.tid.w()		// CHECK: call i32 @llvm.ptx.read.tid.w()

int x = __builtin_ptx_read_tid_x();		int x = __builtin_ptx_read_tid_x();
int y = __builtin_ptx_read_tid_y();		int y = __builtin_ptx_read_tid_y();
int z = __builtin_ptx_read_tid_z();		int z = __builtin_ptx_read_tid_z();
int w = __builtin_ptx_read_tid_w();		int w = __builtin_ptx_read_tid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ntid() {		__device__ int read_ntid() {

// CHECK: call i32 @llvm.ptx.read.ntid.x()		// CHECK: call i32 @llvm.ptx.read.ntid.x()
// CHECK: call i32 @llvm.ptx.read.ntid.y()		// CHECK: call i32 @llvm.ptx.read.ntid.y()
// CHECK: call i32 @llvm.ptx.read.ntid.z()		// CHECK: call i32 @llvm.ptx.read.ntid.z()
// CHECK: call i32 @llvm.ptx.read.ntid.w()		// CHECK: call i32 @llvm.ptx.read.ntid.w()

int x = __builtin_ptx_read_ntid_x();		int x = __builtin_ptx_read_ntid_x();
int y = __builtin_ptx_read_ntid_y();		int y = __builtin_ptx_read_ntid_y();
int z = __builtin_ptx_read_ntid_z();		int z = __builtin_ptx_read_ntid_z();
int w = __builtin_ptx_read_ntid_w();		int w = __builtin_ptx_read_ntid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ctaid() {		__device__ int read_ctaid() {

// CHECK: call i32 @llvm.ptx.read.ctaid.x()		// CHECK: call i32 @llvm.ptx.read.ctaid.x()
// CHECK: call i32 @llvm.ptx.read.ctaid.y()		// CHECK: call i32 @llvm.ptx.read.ctaid.y()
// CHECK: call i32 @llvm.ptx.read.ctaid.z()		// CHECK: call i32 @llvm.ptx.read.ctaid.z()
// CHECK: call i32 @llvm.ptx.read.ctaid.w()		// CHECK: call i32 @llvm.ptx.read.ctaid.w()

int x = __builtin_ptx_read_ctaid_x();		int x = __builtin_ptx_read_ctaid_x();
int y = __builtin_ptx_read_ctaid_y();		int y = __builtin_ptx_read_ctaid_y();
int z = __builtin_ptx_read_ctaid_z();		int z = __builtin_ptx_read_ctaid_z();
int w = __builtin_ptx_read_ctaid_w();		int w = __builtin_ptx_read_ctaid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_nctaid() {		__device__ int read_nctaid() {

// CHECK: call i32 @llvm.ptx.read.nctaid.x()		// CHECK: call i32 @llvm.ptx.read.nctaid.x()
// CHECK: call i32 @llvm.ptx.read.nctaid.y()		// CHECK: call i32 @llvm.ptx.read.nctaid.y()
// CHECK: call i32 @llvm.ptx.read.nctaid.z()		// CHECK: call i32 @llvm.ptx.read.nctaid.z()
// CHECK: call i32 @llvm.ptx.read.nctaid.w()		// CHECK: call i32 @llvm.ptx.read.nctaid.w()

int x = __builtin_ptx_read_nctaid_x();		int x = __builtin_ptx_read_nctaid_x();
int y = __builtin_ptx_read_nctaid_y();		int y = __builtin_ptx_read_nctaid_y();
int z = __builtin_ptx_read_nctaid_z();		int z = __builtin_ptx_read_nctaid_z();
int w = __builtin_ptx_read_nctaid_w();		int w = __builtin_ptx_read_nctaid_w();

return x + y + z + w;		return x + y + z + w;

}		}

int read_ids() {		__device__ int read_ids() {

// CHECK: call i32 @llvm.ptx.read.laneid()		// CHECK: call i32 @llvm.ptx.read.laneid()
// CHECK: call i32 @llvm.ptx.read.warpid()		// CHECK: call i32 @llvm.ptx.read.warpid()
// CHECK: call i32 @llvm.ptx.read.nwarpid()		// CHECK: call i32 @llvm.ptx.read.nwarpid()
// CHECK: call i32 @llvm.ptx.read.smid()		// CHECK: call i32 @llvm.ptx.read.smid()
// CHECK: call i32 @llvm.ptx.read.nsmid()		// CHECK: call i32 @llvm.ptx.read.nsmid()
// CHECK: call i32 @llvm.ptx.read.gridid()		// CHECK: call i32 @llvm.ptx.read.gridid()

int a = __builtin_ptx_read_laneid();		int a = __builtin_ptx_read_laneid();
int b = __builtin_ptx_read_warpid();		int b = __builtin_ptx_read_warpid();
int c = __builtin_ptx_read_nwarpid();		int c = __builtin_ptx_read_nwarpid();
int d = __builtin_ptx_read_smid();		int d = __builtin_ptx_read_smid();
int e = __builtin_ptx_read_nsmid();		int e = __builtin_ptx_read_nsmid();
int f = __builtin_ptx_read_gridid();		int f = __builtin_ptx_read_gridid();

return a + b + c + d + e + f;		return a + b + c + d + e + f;

}		}

int read_lanemasks() {		__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.ptx.read.lanemask.eq()		// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
// CHECK: call i32 @llvm.ptx.read.lanemask.le()		// CHECK: call i32 @llvm.ptx.read.lanemask.le()
// CHECK: call i32 @llvm.ptx.read.lanemask.lt()		// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
// CHECK: call i32 @llvm.ptx.read.lanemask.ge()		// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
// CHECK: call i32 @llvm.ptx.read.lanemask.gt()		// CHECK: call i32 @llvm.ptx.read.lanemask.gt()

int a = __builtin_ptx_read_lanemask_eq();		int a = __builtin_ptx_read_lanemask_eq();
int b = __builtin_ptx_read_lanemask_le();		int b = __builtin_ptx_read_lanemask_le();
int c = __builtin_ptx_read_lanemask_lt();		int c = __builtin_ptx_read_lanemask_lt();
int d = __builtin_ptx_read_lanemask_ge();		int d = __builtin_ptx_read_lanemask_ge();
int e = __builtin_ptx_read_lanemask_gt();		int e = __builtin_ptx_read_lanemask_gt();

return a + b + c + d + e;		return a + b + c + d + e;

}		}

		__device__ long read_clocks() {
long read_clocks() {

// CHECK: call i32 @llvm.ptx.read.clock()		// CHECK: call i32 @llvm.ptx.read.clock()
// CHECK: call i64 @llvm.ptx.read.clock64()		// CHECK: call i64 @llvm.ptx.read.clock64()

int a = __builtin_ptx_read_clock();		int a = __builtin_ptx_read_clock();
long b = __builtin_ptx_read_clock64();		long b = __builtin_ptx_read_clock64();

return (long)a + b;		return (long)a + b;

}		}

int read_pms() {		__device__ int read_pms() {

// CHECK: call i32 @llvm.ptx.read.pm0()		// CHECK: call i32 @llvm.ptx.read.pm0()
// CHECK: call i32 @llvm.ptx.read.pm1()		// CHECK: call i32 @llvm.ptx.read.pm1()
// CHECK: call i32 @llvm.ptx.read.pm2()		// CHECK: call i32 @llvm.ptx.read.pm2()
// CHECK: call i32 @llvm.ptx.read.pm3()		// CHECK: call i32 @llvm.ptx.read.pm3()

int a = __builtin_ptx_read_pm0();		int a = __builtin_ptx_read_pm0();
int b = __builtin_ptx_read_pm1();		int b = __builtin_ptx_read_pm1();
int c = __builtin_ptx_read_pm2();		int c = __builtin_ptx_read_pm2();
int d = __builtin_ptx_read_pm3();		int d = __builtin_ptx_read_pm3();

return a + b + c + d;		return a + b + c + d;

}		}

void sync() {		__device__ void sync() {

// CHECK: call void @llvm.ptx.bar.sync(i32 0)		// CHECK: call void @llvm.ptx.bar.sync(i32 0)

__builtin_ptx_bar_sync(0);		__builtin_ptx_bar_sync(0);

}		}


// NVVM intrinsics		// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the		// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def		// builtins defined in BuiltinsNVPTX.def
void nvvm_math(float f1, float f2, double d1, double d2) {		__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f		// CHECK: call float @llvm.nvvm.fmax.f
float t1 = __nvvm_fmax_f(f1, f2);		float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f		// CHECK: call float @llvm.nvvm.fmin.f
float t2 = __nvvm_fmin_f(f1, f2);		float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f		// CHECK: call float @llvm.nvvm.sqrt.rn.f
float t3 = __nvvm_sqrt_rn_f(f1);		float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f		// CHECK: call float @llvm.nvvm.rcp.rn.f
float t4 = __nvvm_rcp_rn_f(f2);		float t4 = __nvvm_rcp_rn_f(f2);
Show All 13 Lines	// CHECK: call void @llvm.nvvm.membar.cta()
__nvvm_membar_cta();		__nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()		// CHECK: call void @llvm.nvvm.membar.gl()
__nvvm_membar_gl();		__nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()		// CHECK: call void @llvm.nvvm.membar.sys()
__nvvm_membar_sys();		__nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()		// CHECK: call void @llvm.nvvm.barrier0()
__nvvm_bar0();		__nvvm_bar0();
}		}

		__device__ int di;
		__shared__ int si;
		__device__ long dl;
		__shared__ long sl;
		__device__ long long dll;
		__shared__ long long sll;

		// Check for atomic intrinsics
		// CHECK-LABEL: nvvm_atom
		__device__ void nvvm_atom(float fp, float f, int ip, int i, long *lp, long l,
		long long *llp, long long ll) {
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_i(ip, i);
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_l(&dl, l);
		// CHECK: atomicrmw add
		__nvvm_atom_add_gen_ll(&sll, ll);

		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_i(ip, i);
		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_l(&dl, l);
		// CHECK: atomicrmw sub
		__nvvm_atom_sub_gen_ll(&sll, ll);

		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_i(ip, i);
		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_l(&dl, l);
		// CHECK: atomicrmw and
		__nvvm_atom_and_gen_ll(&sll, ll);

		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_i(ip, i);
		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_l(&dl, l);
		// CHECK: atomicrmw or
		__nvvm_atom_or_gen_ll(&sll, ll);

		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_i(ip, i);
		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_l(&dl, l);
		// CHECK: atomicrmw xor
		__nvvm_atom_xor_gen_ll(&sll, ll);

		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_i(ip, i);
		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_l(&dl, l);
		// CHECK: atomicrmw xchg
		__nvvm_atom_xchg_gen_ll(&sll, ll);

		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_i(ip, i);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ui((unsigned int *)ip, i);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_l(&dl, l);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ll(&sll, ll);
		// CHECK: atomicrmw max
		__nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_i(ip, i);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ui((unsigned int *)ip, i);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_l(&dl, l);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ll(&sll, ll);
		// CHECK: atomicrmw min
		__nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_i(ip, 0, i);
		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_l(&dl, 0, l);
		// CHECK: cmpxchg
		__nvvm_atom_cas_gen_ll(&sll, 0, ll);

		// CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
		__nvvm_atom_add_gen_f(fp, f);

		// CHECK: ret
		}