Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -481,7 +481,7 @@
 TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", "satom")
 BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n")
 BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n")
-BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n")
+TARGET_BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n", "satom")
 TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", "satom")
 TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", "satom")
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -9580,6 +9580,16 @@
     return Builder.CreateCall(FnALAF32, {Ptr, Val});
   }
 
+  case NVPTX::BI__nvvm_atom_add_gen_d: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    // atomicrmw only deals with integer arguments, so we need to use
+    // LLVM's nvvm_atomic_load_add_f64 intrinsic.
+    Value *FnALAF64 =
+        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType());
+    return Builder.CreateCall(FnALAF64, {Ptr, Val});
+  }
+
   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
     Value *Ptr = EmitScalarExpr(E->getArg(0));
     Value *Val = EmitScalarExpr(E->getArg(1));
Index: clang/test/CodeGen/builtins-nvptx-ptx50.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGen/builtins-nvptx-ptx50.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK %s
+//
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 \
+// RUN:   -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+// We have to keep all builtins that depend on particular target feature in the
+// same function, because the codegen will stop after the very first function
+// that encounters an error, so -verify will not be able to find errors in
+// subsequent functions.
+
+// CHECK-LABEL: test_fn
+__device__ void test_fn(double d, double* double_ptr) {
+  // CHECK: call double @llvm.nvvm.atomic.load.add.f64.p0f64
+  // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature satom}}
+  __nvvm_atom_add_gen_d(double_ptr, d);
+}
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -683,10 +683,15 @@
       Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
 
-// Atomic not available as an llvm intrinsic.
+// Atomics not available as llvm intrinsics.
   def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
       [LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
       [IntrArgMemOnly, NoCapture<0>]>;
+  // Atomic add of f64 requires sm_60.
+  def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty],
+      [LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty],
+      [IntrArgMemOnly, NoCapture<0>]>;
+
   def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
       [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
       [IntrArgMemOnly, NoCapture<0>]>;
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3449,6 +3449,7 @@
   }
 
   case Intrinsic::nvvm_atomic_load_add_f32:
+  case Intrinsic::nvvm_atomic_load_add_f64:
   case Intrinsic::nvvm_atomic_load_inc_32:
   case Intrinsic::nvvm_atomic_load_dec_32:
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1095,6 +1095,12 @@
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
 def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
 
 defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2;
@@ -1121,6 +1127,13 @@
 defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2;
+defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2;
+defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2;
+defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2;
+
 // atom_sub
 def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
Index: llvm/test/CodeGen/NVPTX/atomics-sm60.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK-LABEL: .func test(
+define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
+; CHECK: atom.add.f64
+  %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
+; CHECK: atom.global.add.f64
+  %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
+; CHECK: atom.shared.add.f64
+  %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d)
+  ret void
+}
+
+declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1
+
+attributes #1 = { argmemonly nounwind }
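
For reference, a minimal CUDA sketch (not part of the patch) of how the new builtin would be reached from device code once the target provides the satom feature, e.g. when compiling with -target-cpu sm_60. The kernel name and thread indexing are illustrative only; on sm_50 and older the call is rejected with the "needs target feature satom" diagnostic exercised by builtins-nvptx-ptx50.cu above.

  // Illustrative only: each thread atomically folds one input element into a
  // single double-precision total. The builtin lowers to a call to
  // @llvm.nvvm.atomic.load.add.f64, which selects to atom.add.f64 on sm_60+.
  #define __device__ __attribute__((device))
  #define __global__ __attribute__((global))

  __global__ void sum_kernel(const double *in, double *total, int n) {
    int i = __nvvm_read_ptx_sreg_ctaid_x() * __nvvm_read_ptx_sreg_ntid_x() +
            __nvvm_read_ptx_sreg_tid_x();   // global thread index
    if (i < n)
      __nvvm_atom_add_gen_d(total, in[i]);
  }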