Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -82,33 +82,40 @@
 /// Utility to insert an atomic instruction based on Intrinsic::ID
 /// and the expression node.
+static Value *MakeBinaryAtomicValue(CodeGenFunction &CGF,
+                                    llvm::AtomicRMWInst::BinOp Kind,
+                                    const CallExpr *E) {
+  QualType T = E->getType();
+  assert(E->getArg(0)->getType()->isPointerType());
+  assert(CGF.getContext().hasSameUnqualifiedType(T,
+                                  E->getArg(0)->getType()->getPointeeType()));
+  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
+
+  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
+  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
+
+  llvm::IntegerType *IntType =
+      llvm::IntegerType::get(CGF.getLLVMContext(),
+                             CGF.getContext().getTypeSize(T));
+  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
+
+  llvm::Value *Args[2];
+  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
+  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
+  llvm::Type *ValueType = Args[1]->getType();
+  Args[1] = EmitToInt(CGF, Args[1], T, IntType);
+
+  llvm::Value *Result =
+      CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
+                                  llvm::SequentiallyConsistent);
+  Result = EmitFromInt(CGF, Result, T, ValueType);
+  return Result;
+}
+
 static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
                                llvm::AtomicRMWInst::BinOp Kind,
                                const CallExpr *E) {
-  QualType T = E->getType();
-  assert(E->getArg(0)->getType()->isPointerType());
-  assert(CGF.getContext().hasSameUnqualifiedType(T,
-                                  E->getArg(0)->getType()->getPointeeType()));
-  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
-
-  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
-  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
-
-  llvm::IntegerType *IntType =
-      llvm::IntegerType::get(CGF.getLLVMContext(),
-                             CGF.getContext().getTypeSize(T));
-  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
-
-  llvm::Value *Args[2];
-  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
-  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
-  llvm::Type *ValueType = Args[1]->getType();
-  Args[1] = EmitToInt(CGF, Args[1], T, IntType);
-
-  llvm::Value *Result =
-      CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1],
-                                  llvm::SequentiallyConsistent);
-  Result = EmitFromInt(CGF, Result, T, ValueType);
+  llvm::Value *Result = MakeBinaryAtomicValue(CGF, Kind, E);
   return RValue::get(Result);
 }
 
@@ -151,6 +158,41 @@
   return RValue::get(Result);
 }
 
+/// Utility to insert an atomic cmpxchg instruction based on
+/// Intrinsic::ID and the expression node, where the return value is
+/// the result of the operation.
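+/// When \p ReturnBool is true, the result is the boolean success flag of the
+/// exchange, zero-extended to the builtin's result type; otherwise it is the
+/// value that was originally loaded from the destination.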
+static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
+                                     bool ReturnBool) {
+  QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
+  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
+  unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
+
+  llvm::IntegerType *IntType = llvm::IntegerType::get(
+      CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
+  llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
+
+  Value *Args[3];
+  Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType);
+  Args[1] = CGF.EmitScalarExpr(E->getArg(1));
+  llvm::Type *ValueType = Args[1]->getType();
+  Args[1] = EmitToInt(CGF, Args[1], T, IntType);
+  Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
+
+  Value *Pair = CGF.Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
+                                                llvm::SequentiallyConsistent,
+                                                llvm::SequentiallyConsistent);
+  Value *Result;
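+  // The cmpxchg instruction yields a {original value, success flag} pair;
+  // extract the member the caller asked for.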
+  if (ReturnBool) {
+    Result = CGF.Builder.CreateExtractValue(Pair, 1);
+    // zext bool to int.
+    Result = CGF.Builder.CreateZExt(Result, CGF.ConvertType(E->getType()));
+  } else {
+    Result = CGF.Builder.CreateExtractValue(Pair, 0);
+    Result = EmitFromInt(CGF, Result, T, ValueType);
+  }
+  return Result;
+}
+
 /// EmitFAbs - Emit a call to @llvm.fabs().
 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
   Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
@@ -1057,58 +1099,15 @@
   case Builtin::BI__sync_val_compare_and_swap_2:
   case Builtin::BI__sync_val_compare_and_swap_4:
   case Builtin::BI__sync_val_compare_and_swap_8:
-  case Builtin::BI__sync_val_compare_and_swap_16: {
-    QualType T = E->getType();
-    llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
-    unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
-
-    llvm::IntegerType *IntType =
-      llvm::IntegerType::get(getLLVMContext(),
-                             getContext().getTypeSize(T));
-    llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
-
-    Value *Args[3];
-    Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
-    Args[1] = EmitScalarExpr(E->getArg(1));
-    llvm::Type *ValueType = Args[1]->getType();
-    Args[1] = EmitToInt(*this, Args[1], T, IntType);
-    Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
-
-    Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
-                                                llvm::SequentiallyConsistent,
-                                                llvm::SequentiallyConsistent);
-    Result = Builder.CreateExtractValue(Result, 0);
-    Result = EmitFromInt(*this, Result, T, ValueType);
-    return RValue::get(Result);
-  }
+  case Builtin::BI__sync_val_compare_and_swap_16:
+    return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
 
   case Builtin::BI__sync_bool_compare_and_swap_1:
   case Builtin::BI__sync_bool_compare_and_swap_2:
   case Builtin::BI__sync_bool_compare_and_swap_4:
   case Builtin::BI__sync_bool_compare_and_swap_8:
-  case Builtin::BI__sync_bool_compare_and_swap_16: {
-    QualType T = E->getArg(1)->getType();
-    llvm::Value *DestPtr = EmitScalarExpr(E->getArg(0));
-    unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace();
-
-    llvm::IntegerType *IntType =
-      llvm::IntegerType::get(getLLVMContext(),
-                             getContext().getTypeSize(T));
-    llvm::Type *IntPtrType = IntType->getPointerTo(AddrSpace);
-
-    Value *Args[3];
-    Args[0] = Builder.CreateBitCast(DestPtr, IntPtrType);
-    Args[1] = EmitToInt(*this, EmitScalarExpr(E->getArg(1)), T, IntType);
-    Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
-
-    Value *Pair = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
-                                              llvm::SequentiallyConsistent,
-                                              llvm::SequentiallyConsistent);
-    Value *Result = Builder.CreateExtractValue(Pair, 1);
-    // zext bool to int.
-    Result = Builder.CreateZExt(Result, ConvertType(E->getType()));
-    return RValue::get(Result);
-  }
+  case Builtin::BI__sync_bool_compare_and_swap_16:
+    return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
 
   case Builtin::BI__sync_swap_1:
   case Builtin::BI__sync_swap_2:
@@ -1880,6 +1879,9 @@
     return EmitAMDGPUBuiltinExpr(BuiltinID, E);
   case llvm::Triple::systemz:
     return EmitSystemZBuiltinExpr(BuiltinID, E);
+  case llvm::Triple::nvptx:
+  case llvm::Triple::nvptx64:
+    return EmitNVPTXBuiltinExpr(BuiltinID, E);
   default:
     return nullptr;
   }
@@ -6859,3 +6861,72 @@
     return nullptr;
   }
 }
+
+Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
+                                             const CallExpr *E) {
+  switch (BuiltinID) {
+  case NVPTX::BI__nvvm_atom_add_gen_i:
+  case NVPTX::BI__nvvm_atom_add_gen_l:
+  case NVPTX::BI__nvvm_atom_add_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
+
+  case NVPTX::BI__nvvm_atom_sub_gen_i:
+  case NVPTX::BI__nvvm_atom_sub_gen_l:
+  case NVPTX::BI__nvvm_atom_sub_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
+
+  case NVPTX::BI__nvvm_atom_and_gen_i:
+  case NVPTX::BI__nvvm_atom_and_gen_l:
+  case NVPTX::BI__nvvm_atom_and_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
+
+  case NVPTX::BI__nvvm_atom_or_gen_i:
+  case NVPTX::BI__nvvm_atom_or_gen_l:
+  case NVPTX::BI__nvvm_atom_or_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
+
+  case NVPTX::BI__nvvm_atom_xor_gen_i:
+  case NVPTX::BI__nvvm_atom_xor_gen_l:
+  case NVPTX::BI__nvvm_atom_xor_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
+
+  case NVPTX::BI__nvvm_atom_xchg_gen_i:
+  case NVPTX::BI__nvvm_atom_xchg_gen_l:
+  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
+
+  case NVPTX::BI__nvvm_atom_max_gen_i:
+  case NVPTX::BI__nvvm_atom_max_gen_l:
+  case NVPTX::BI__nvvm_atom_max_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
+
+  // AtomicRMWInst::Max/Min compare as signed, so the unsigned builtins must
+  // lower to the unsigned flavors of the instruction.
+  case NVPTX::BI__nvvm_atom_max_gen_ui:
+  case NVPTX::BI__nvvm_atom_max_gen_ul:
+  case NVPTX::BI__nvvm_atom_max_gen_ull:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
+
+  case NVPTX::BI__nvvm_atom_min_gen_i:
+  case NVPTX::BI__nvvm_atom_min_gen_l:
+  case NVPTX::BI__nvvm_atom_min_gen_ll:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
+
+  case NVPTX::BI__nvvm_atom_min_gen_ui:
+  case NVPTX::BI__nvvm_atom_min_gen_ul:
+  case NVPTX::BI__nvvm_atom_min_gen_ull:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
+
+  case NVPTX::BI__nvvm_atom_cas_gen_i:
+  case NVPTX::BI__nvvm_atom_cas_gen_l:
+  case NVPTX::BI__nvvm_atom_cas_gen_ll:
+    // __nvvm_atom_cas_gen_* should return the old value rather than the
+    // success flag.
+    return MakeAtomicCmpXchgValue(*this, E, false);
+
+  case NVPTX::BI__nvvm_atom_add_gen_f: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    // atomicrmw only deals with integer arguments, so we need to use
+    // LLVM's nvvm_atomic_load_add_f32 intrinsic instead.
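+    // The intrinsic is overloaded on its pointer operand, so the emitted
+    // call is mangled with the pointer's address space (p0f32 for the
+    // generic pointer used here).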
+    Value *FnALAF32 =
+        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f32, Ptr->getType());
+    return Builder.CreateCall(FnALAF32, {Ptr, Val});
+  }
+
+  default:
+    return nullptr;
+  }
+}
Index: lib/CodeGen/CodeGenFunction.h
===================================================================
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -2586,6 +2586,7 @@
   llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+  llvm::Value *EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
   llvm::Value *EmitObjCProtocolExpr(const ObjCProtocolExpr *E);
   llvm::Value *EmitObjCStringLiteral(const ObjCStringLiteral *E);
Index: test/CodeGen/builtins-nvptx.c
===================================================================
--- test/CodeGen/builtins-nvptx.c
+++ test/CodeGen/builtins-nvptx.c
@@ -1,8 +1,13 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
 
-int read_tid() {
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+__device__ int read_tid() {
 
 // CHECK: call i32 @llvm.ptx.read.tid.x()
 // CHECK: call i32 @llvm.ptx.read.tid.y()
@@ -18,7 +23,7 @@
 
 }
 
-int read_ntid() {
+__device__ int read_ntid() {
 
 // CHECK: call i32 @llvm.ptx.read.ntid.x()
 // CHECK: call i32 @llvm.ptx.read.ntid.y()
@@ -34,7 +39,7 @@
 
 }
 
-int read_ctaid() {
+__device__ int read_ctaid() {
 
 // CHECK: call i32 @llvm.ptx.read.ctaid.x()
 // CHECK: call i32 @llvm.ptx.read.ctaid.y()
@@ -50,7 +55,7 @@
 
 }
 
-int read_nctaid() {
+__device__ int read_nctaid() {
 
 // CHECK: call i32 @llvm.ptx.read.nctaid.x()
 // CHECK: call i32 @llvm.ptx.read.nctaid.y()
@@ -66,7 +71,7 @@
 
 }
 
-int read_ids() {
+__device__ int read_ids() {
 
 // CHECK: call i32 @llvm.ptx.read.laneid()
 // CHECK: call i32 @llvm.ptx.read.warpid()
@@ -86,7 +91,7 @@
 
 }
 
-int read_lanemasks() {
+__device__ int read_lanemasks() {
 
 // CHECK: call i32 @llvm.ptx.read.lanemask.eq()
 // CHECK: call i32 @llvm.ptx.read.lanemask.le()
@@ -104,8 +109,7 @@
 
 }
 
-
-long read_clocks() {
+__device__ long read_clocks() {
 
 // CHECK: call i32 @llvm.ptx.read.clock()
 // CHECK: call i64 @llvm.ptx.read.clock64()
@@ -117,7 +121,7 @@
 
 }
 
-int read_pms() {
+__device__ int read_pms() {
 
 // CHECK: call i32 @llvm.ptx.read.pm0()
 // CHECK: call i32 @llvm.ptx.read.pm1()
@@ -133,7 +137,7 @@
 
 }
 
-void sync() {
+__device__ void sync() {
 
 // CHECK: call void @llvm.ptx.bar.sync(i32 0)
 
@@ -146,7 +150,7 @@
 // The idea is not to test all intrinsics, just that Clang is recognizing the
 // builtins defined in BuiltinsNVPTX.def
 
-void nvvm_math(float f1, float f2, double d1, double d2) {
+__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
 // CHECK: call float @llvm.nvvm.fmax.f
   float t1 = __nvvm_fmax_f(f1, f2);
 // CHECK: call float @llvm.nvvm.fmin.f
@@ -176,3 +180,95 @@
 // CHECK: call void @llvm.nvvm.barrier0()
   __nvvm_bar0();
 }
+
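+// Globals in the device and shared address spaces, so the atomics below are
+// exercised on pointers into different memory spaces, not just on the
+// generic pointer parameters.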
+__device__ int di;
+__shared__ int si;
+__device__ long dl;
+__shared__ long sl;
+__device__ long long dll;
+__shared__ long long sll;
+
+// Check for atomic intrinsics
+// CHECK-LABEL: nvvm_atom
+__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
+                          long long *llp, long long ll) {
+  // CHECK: atomicrmw add
+  __nvvm_atom_add_gen_i(ip, i);
+  // CHECK: atomicrmw add
+  __nvvm_atom_add_gen_l(&dl, l);
+  // CHECK: atomicrmw add
+  __nvvm_atom_add_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw sub
+  __nvvm_atom_sub_gen_i(ip, i);
+  // CHECK: atomicrmw sub
+  __nvvm_atom_sub_gen_l(&dl, l);
+  // CHECK: atomicrmw sub
+  __nvvm_atom_sub_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw and
+  __nvvm_atom_and_gen_i(ip, i);
+  // CHECK: atomicrmw and
+  __nvvm_atom_and_gen_l(&dl, l);
+  // CHECK: atomicrmw and
+  __nvvm_atom_and_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw or
+  __nvvm_atom_or_gen_i(ip, i);
+  // CHECK: atomicrmw or
+  __nvvm_atom_or_gen_l(&dl, l);
+  // CHECK: atomicrmw or
+  __nvvm_atom_or_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw xor
+  __nvvm_atom_xor_gen_i(ip, i);
+  // CHECK: atomicrmw xor
+  __nvvm_atom_xor_gen_l(&dl, l);
+  // CHECK: atomicrmw xor
+  __nvvm_atom_xor_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw xchg
+  __nvvm_atom_xchg_gen_i(ip, i);
+  // CHECK: atomicrmw xchg
+  __nvvm_atom_xchg_gen_l(&dl, l);
+  // CHECK: atomicrmw xchg
+  __nvvm_atom_xchg_gen_ll(&sll, ll);
+
+  // CHECK: atomicrmw max
+  __nvvm_atom_max_gen_i(ip, i);
+  // CHECK: atomicrmw umax
+  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
+  // CHECK: atomicrmw max
+  __nvvm_atom_max_gen_l(&dl, l);
+  // CHECK: atomicrmw umax
+  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
+  // CHECK: atomicrmw max
+  __nvvm_atom_max_gen_ll(&sll, ll);
+  // CHECK: atomicrmw umax
+  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
+
+  // CHECK: atomicrmw min
+  __nvvm_atom_min_gen_i(ip, i);
+  // CHECK: atomicrmw umin
+  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
+  // CHECK: atomicrmw min
+  __nvvm_atom_min_gen_l(&dl, l);
+  // CHECK: atomicrmw umin
+  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
+  // CHECK: atomicrmw min
+  __nvvm_atom_min_gen_ll(&sll, ll);
+  // CHECK: atomicrmw umin
+  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
+
+  // CHECK: cmpxchg
+  __nvvm_atom_cas_gen_i(ip, 0, i);
+  // CHECK: cmpxchg
+  __nvvm_atom_cas_gen_l(&dl, 0, l);
+  // CHECK: cmpxchg
+  __nvvm_atom_cas_gen_ll(&sll, 0, ll);
+
+  // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+  __nvvm_atom_add_gen_f(fp, f);
+
+  // CHECK: ret
+}