Index: llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
@@ -729,6 +729,39 @@
       [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
       [IntrArgMemOnly, NoCapture<0>]>;
 
+  class SCOPED_ATOMIC2_impl<LLVMType elty>
+        : Intrinsic<[elty],
+          [LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>],
+          [IntrArgMemOnly, NoCapture<0>]>;
+  class SCOPED_ATOMIC3_impl<LLVMType elty>
+        : Intrinsic<[elty],
+          [LLVMAnyPointerType<LLVMMatchType<0>>, LLVMMatchType<0>,
+           LLVMMatchType<0>],
+          [IntrArgMemOnly, NoCapture<0>]>;
+
+  multiclass PTXAtomicWithScope2<LLVMType elty> {
+    def _cta : SCOPED_ATOMIC2_impl<elty>;
+    def _sys : SCOPED_ATOMIC2_impl<elty>;
+  }
+  multiclass PTXAtomicWithScope3<LLVMType elty> {
+    def _cta : SCOPED_ATOMIC3_impl<elty>;
+    def _sys : SCOPED_ATOMIC3_impl<elty>;
+  }
+  multiclass PTXAtomicWithScope2_fi {
+    defm _f: PTXAtomicWithScope2<llvm_anyfloat_ty>;
+    defm _i: PTXAtomicWithScope2<llvm_anyint_ty>;
+  }
+  defm int_nvvm_atomic_add_gen   : PTXAtomicWithScope2_fi;
+  defm int_nvvm_atomic_inc_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_dec_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_exch_gen_i: PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_xor_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_max_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_min_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_or_gen_i  : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_and_gen_i : PTXAtomicWithScope2<llvm_anyint_ty>;
+  defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
+
 // Bar.Sync
 
 // The builtin for "bar.sync 0" is called __syncthreads.  Unlike most of the
Index: llvm/trunk/lib/Target/NVPTX/NVPTX.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTX.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTX.td
@@ -51,6 +51,9 @@
 def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
                             "Target SM 6.2">;
 
+def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
+                             "Atomic operations with scope">;
+
 // PTX Versions
 def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
                              "Use PTX version 3.2">;
@@ -81,9 +84,9 @@
 def : Proc<"sm_50", [SM50, PTX40]>;
 def : Proc<"sm_52", [SM52, PTX41]>;
 def : Proc<"sm_53", [SM53, PTX42]>;
-def : Proc<"sm_60", [SM60, PTX50]>;
-def : Proc<"sm_61", [SM61, PTX50]>;
-def : Proc<"sm_62", [SM62, PTX50]>;
+def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
+def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
+def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
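For orientation, the multiclasses above expand into overloaded intrinsics whose mangled names appear in the test at the end of this patch. A minimal IR usage sketch (the declaration is copied verbatim from that test; the wrapper function is illustrative only):

  declare i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* nocapture, i32)

  define i32 @example_cta_add(i32* %p) {
    ; Atomically add 1 to *%p with CTA (thread-block) scope.
    %old = call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %p, i32 1)
    ret i32 %old
  }
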
Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3274,20 +3274,34 @@
     return false;
 
   case Intrinsic::nvvm_atomic_load_add_f32:
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::f32;
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.vol = 0;
-    Info.readMem = true;
-    Info.writeMem = true;
-    Info.align = 0;
-    return true;
-
   case Intrinsic::nvvm_atomic_load_inc_32:
   case Intrinsic::nvvm_atomic_load_dec_32:
+
+  case Intrinsic::nvvm_atomic_add_gen_f_cta:
+  case Intrinsic::nvvm_atomic_add_gen_f_sys:
+  case Intrinsic::nvvm_atomic_add_gen_i_cta:
+  case Intrinsic::nvvm_atomic_add_gen_i_sys:
+  case Intrinsic::nvvm_atomic_and_gen_i_cta:
+  case Intrinsic::nvvm_atomic_and_gen_i_sys:
+  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+  case Intrinsic::nvvm_atomic_max_gen_i_cta:
+  case Intrinsic::nvvm_atomic_max_gen_i_sys:
+  case Intrinsic::nvvm_atomic_min_gen_i_cta:
+  case Intrinsic::nvvm_atomic_min_gen_i_sys:
+  case Intrinsic::nvvm_atomic_or_gen_i_cta:
+  case Intrinsic::nvvm_atomic_or_gen_i_sys:
+  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
+    auto &DL = I.getModule()->getDataLayout();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::i32;
+    Info.memVT = getValueType(DL, I.getType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.vol = 0;
@@ -3295,6 +3309,7 @@
     Info.writeMem = true;
     Info.align = 0;
     return true;
+  }
 
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
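The getTgtMemIntrinsic() change above stops hard-coding the memory type: the new intrinsics are overloaded, so memVT is now derived from the call's return type. For example, the i64 overload (declared in the test below) now records MVT::i64 where the old code would have claimed MVT::i32:

  %old = call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %p, i64 %v)
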
Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -131,6 +131,10 @@
 def useAtomRedG64forGen64 :
   Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
 def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
 def hasVote : Predicate<"Subtarget->hasVote()">;
 def hasDouble : Predicate<"Subtarget->hasDouble()">;
 def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1377,8 +1377,204 @@
 defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
   ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
 
+// Support for scoped atomic operations.  Matches
+// int_nvvm_atomic_{op}_{space}_{type}_{scope}
+// and converts it into the appropriate instruction.
+// NOTE: not all possible combinations are implemented
+//  'space' is limited to generic as it's the only one needed to support CUDA.
+//  'scope' = 'gpu' is default and is handled by regular atomic instructions.
+class ATOM23_impl<string AsmStr, NVPTXRegClass regclass, list<Predicate> Preds,
+                  dag ins, dag Operands>
+      : NVPTXInst<(outs regclass:$result), ins,
+                  AsmStr,
+                  [(set regclass:$result, Operands)]>,
+        Requires<Preds>;
+
+// Define instruction variants for all addressing modes.
+multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr,
+                       NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy,
+                       list<Predicate> Preds> {
+  let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b),
+                      (Intr Int32Regs:$src, regclass:$b)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b),
+                      (Intr Int64Regs:$src, regclass:$b)>;
+  }
+  // tablegen can't infer argument types from Intrinsic (though it can
+  // from Instruction) so we have to enforce specific type on
+  // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int32Regs:$src, ImmType:$b),
+                    (Intr Int32Regs:$src, (ImmTy Imm:$b))>;
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int64Regs:$src, ImmType:$b),
+                    (Intr Int64Regs:$src, (ImmTy Imm:$b))>;
+}
+
+multiclass ATOM3P_impl<string AsmStr, Intrinsic Intr,
+                       NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy,
+                       list<Predicate> Preds> {
+  // Variants for register/immediate permutations of $b and $c
+  let AddedComplexity = 2 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b, regclass:$c),
+                      (Intr Int32Regs:$src, regclass:$b, regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b, regclass:$c),
+                      (Intr Int64Regs:$src, regclass:$b, regclass:$c)>;
+  }
+  let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, ImmType:$b, regclass:$c),
+                      (Intr Int32Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, ImmType:$b, regclass:$c),
+                      (Intr Int64Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b, ImmType:$c),
+                      (Intr Int32Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b, ImmType:$c),
+                      (Intr Int64Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+  }
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int32Regs:$src, ImmType:$b, ImmType:$c),
+                    (Intr Int32Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int64Regs:$src, ImmType:$b, ImmType:$c),
+                    (Intr Int64Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+}
+
+// Constructs intrinsic name and instruction asm strings.
+multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, string SpaceStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+                     # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+                     # "." # OpStr # "." # TypeStr
+                     # " \t$result, [$src], $b;",
+                     !cast<Intrinsic>(
+                        "int_nvvm_atomic_" # OpStr
+                        # "_" # SpaceStr # "_" # IntTypeStr
+                        # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+                     regclass, ImmType, Imm, ImmTy, Preds>;
+}
+multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, string SpaceStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+                     # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+                     # "." # OpStr # "." # TypeStr
+                     # " \t$result, [$src], $b, $c;",
+                     !cast<Intrinsic>(
+                        "int_nvvm_atomic_" # OpStr
+                        # "_" # SpaceStr # "_" # IntTypeStr
+                        # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+                     regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different address spaces.
+// For now we only need variants for generic space pointers.
+multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+ defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+                          regclass, ImmType, Imm, ImmTy, Preds>;
+}
+multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+ defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+                          regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different scopes of atomic op.
+multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  // .gpu scope is default and is currently covered by existing
+  // atomics w/o explicitly specified scope.
+  defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+                          regclass, ImmType, Imm, ImmTy,
+                          !listconcat(Preds, [hasAtomScope])>;
+  defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+                          regclass, ImmType, Imm, ImmTy,
+                          !listconcat(Preds, [hasAtomScope])>;
+}
+multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  // No need to define ".gpu"-scoped atomics.  They do the same thing
+  // as the regular, non-scoped atomics defined elsewhere.
+  defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+                          regclass, ImmType, Imm, ImmTy,
+                          !listconcat(Preds, [hasAtomScope])>;
+  defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+                          regclass, ImmType, Imm, ImmTy,
+                          !listconcat(Preds, [hasAtomScope])>;
+}
+
+// atom.add
+multiclass ATOM2_add_impl<string OpStr> {
+  defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+  defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+  defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
+  defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
+                          [hasAtomAddF32]>;
+  defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
+                          [hasAtomAddF64]>;
+}
+
+// atom.{and,or,xor}
+multiclass ATOM2_bitwise_impl<string OpStr> {
+  defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+  defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64,
+                          [hasAtomBitwise64]>;
+}
+
+// atom.exch
+multiclass ATOM2_exch_impl<string OpStr> {
+  defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+  defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+// atom.{min,max}
+multiclass ATOM2_minmax_impl<string OpStr> {
+  defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+  defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+  defm _s64 : ATOM2S_impl<OpStr, "i", "s64", Int64Regs, i64imm, imm, i64,
+                          [hasAtomMinMax64]>;
+  defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64,
+                          [hasAtomMinMax64]>;
+}
+
+// atom.{inc,dec}
+multiclass ATOM2_incdec_impl<string OpStr> {
+  defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+}
+
+// atom.cas
+multiclass ATOM3_cas_impl<string OpStr> {
+  defm _b32 : ATOM3S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+  defm _b64 : ATOM3S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
+defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">;
+defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">;
+defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">;
+defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">;
+defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">;
+defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">;
+defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
+defm INT_PTX_SATOM_OR  : ATOM2_bitwise_impl<"or">;
+defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
+
 //-----------------------------------
 // Support for ldu on sm_20 or later
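To make the string construction above concrete: the 'gen' space is dropped from the asm string and only non-default scopes are printed, so the i32 overload of int_nvvm_atomic_max_gen_i_sys selects

  atom.sys.max.s32 \t$result, [$src], $b;

which is exactly the form the test below checks for:

  ; CHECK: atom.sys.max.s32
  %r = call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32(i32* %p, i32 %v)
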
@@ -77,6 +81,10 @@
   bool hasAtomRedGen32() const { return SmVersion >= 20; }
   bool hasAtomRedGen64() const { return SmVersion >= 20; }
   bool hasAtomAddF32() const { return SmVersion >= 20; }
+  bool hasAtomAddF64() const { return SmVersion >= 60; }
+  bool hasAtomScope() const { return HasAtomScope; }
+  bool hasAtomBitwise64() const { return SmVersion >= 32; }
+  bool hasAtomMinMax64() const { return SmVersion >= 32; }
   bool hasVote() const { return SmVersion >= 12; }
   bool hasDouble() const { return SmVersion >= 13; }
   bool reqPTX20() const { return SmVersion >= 20; }
Index: llvm/trunk/lib/Target/NVPTX/NVPTXSubtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -29,8 +29,6 @@
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
   // Provide the default CPU if we don't have one.
-  if (CPU.empty() && FS.size())
-    llvm_unreachable("we are not using FeatureStr");
   TargetName = CPU.empty() ? "sm_20" : CPU;
 
   ParseSubtargetFeatures(TargetName, FS);
Index: llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -42,6 +42,29 @@
   case Intrinsic::nvvm_atomic_load_add_f32:
   case Intrinsic::nvvm_atomic_load_inc_32:
   case Intrinsic::nvvm_atomic_load_dec_32:
+
+  case Intrinsic::nvvm_atomic_add_gen_f_cta:
+  case Intrinsic::nvvm_atomic_add_gen_f_sys:
+  case Intrinsic::nvvm_atomic_add_gen_i_cta:
+  case Intrinsic::nvvm_atomic_add_gen_i_sys:
+  case Intrinsic::nvvm_atomic_and_gen_i_cta:
+  case Intrinsic::nvvm_atomic_and_gen_i_sys:
+  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+  case Intrinsic::nvvm_atomic_max_gen_i_cta:
+  case Intrinsic::nvvm_atomic_max_gen_i_sys:
+  case Intrinsic::nvvm_atomic_min_gen_i_cta:
+  case Intrinsic::nvvm_atomic_min_gen_i_sys:
+  case Intrinsic::nvvm_atomic_or_gen_i_cta:
+  case Intrinsic::nvvm_atomic_or_gen_i_sys:
+  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
     return true;
   }
 }
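A side effect of deleting the llvm_unreachable() above is that a feature string can now reach ParseSubtargetFeatures() even without an explicit CPU, so the new feature should also be selectable directly, e.g. (an illustrative invocation based on standard SubtargetFeature handling, not exercised by this patch; the test relies on -mcpu=sm_60, which implies SATOM):

  llc < atomics-with-scope.ll -march=nvptx64 -mattr=+satom
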
Index: llvm/trunk/test/CodeGen/NVPTX/atomics-with-scope.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/atomics-with-scope.ll
+++ llvm/trunk/test/CodeGen/NVPTX/atomics-with-scope.ll
@@ -0,0 +1,187 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s -check-prefixes=CHECK,CHECK32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK-LABEL: .func test_atomics_scope(
+define void @test_atomics_scope(float* %fp, float %f,
+                                double* %dfp, double %df,
+                                i32* %ip, i32 %i,
+                                i32* %uip, i32 %ui,
+                                i64* %llp, i64 %ll) #0 {
+entry:
+; CHECK: atom.cta.add.s32
+  %tmp36 = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.add.u64
+  %tmp38 = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.add.s32
+  %tmp39 = tail call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.add.u64
+  %tmp41 = tail call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.cta.add.f32
+  %tmp42 = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float %f)
+; CHECK: atom.cta.add.f64
+  %tmp43 = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double %df)
+; CHECK: atom.sys.add.f32
+  %tmp44 = tail call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32(float* %fp, float %f)
+; CHECK: atom.sys.add.f64
+  %tmp45 = tail call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64(double* %dfp, double %df)
+
+; CHECK: atom.cta.exch.b32
+  %tmp46 = tail call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.exch.b64
+  %tmp48 = tail call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.exch.b32
+  %tmp49 = tail call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.exch.b64
+  %tmp51 = tail call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.max.s32
+  %tmp52 = tail call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.max.s64
+  %tmp56 = tail call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.max.s32
+  %tmp58 = tail call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.max.s64
+  %tmp62 = tail call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.min.s32
+  %tmp64 = tail call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.min.s64
+  %tmp68 = tail call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.min.s32
+  %tmp70 = tail call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.min.s64
+  %tmp74 = tail call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.inc.u32
+  %tmp76 = tail call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.inc.u32
+  %tmp77 = tail call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+
+; CHECK: atom.cta.dec.u32
+  %tmp78 = tail call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.dec.u32
+  %tmp79 = tail call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+
+; CHECK: atom.cta.and.b32
+  %tmp80 = tail call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.and.b64
+  %tmp82 = tail call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.and.b32
+  %tmp83 = tail call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.and.b64
+  %tmp85 = tail call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.or.b32
+  %tmp86 = tail call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.or.b64
+  %tmp88 = tail call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.or.b32
+  %tmp89 = tail call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.or.b64
+  %tmp91 = tail call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.xor.b32
+  %tmp92 = tail call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.xor.b64
+  %tmp94 = tail call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.sys.xor.b32
+  %tmp95 = tail call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.sys.xor.b64
+  %tmp97 = tail call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll)
+
+; CHECK: atom.cta.cas.b32
+  %tmp98 = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 %i)
+; CHECK: atom.cta.cas.b64
+  %tmp100 = tail call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll, i64 %ll)
+; CHECK: atom.sys.cas.b32
+  %tmp101 = tail call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* %ip, i32 %i, i32 %i)
+; CHECK: atom.sys.cas.b64
+  %tmp103 = tail call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64(i64* %llp, i64 %ll, i64 %ll)
+
+; CHECK: ret
+  ret void
+}
+
+; Make sure we use constants as operands to our scoped atomic calls, where appropriate.
+; CHECK-LABEL: .func test_atomics_scope_imm(
+define void @test_atomics_scope_imm(float* %fp, float %f,
+                                    double* %dfp, double %df,
+                                    i32* %ip, i32 %i,
+                                    i32* %uip, i32 %ui,
+                                    i64* %llp, i64 %ll) #0 {
+
+; CHECK: atom.cta.add.s32{{.*}} %r{{[0-9]+}};
+  %tmp1r = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 %i)
+; CHECK: atom.cta.add.s32{{.*}}, 1;
+  %tmp1i = tail call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* %ip, i32 1)
+; CHECK: atom.cta.add.u64{{.*}}, %rd{{[0-9]+}};
+  %tmp2r = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 %ll)
+; CHECK: atom.cta.add.u64{{.*}}, 2;
+  %tmp2i = tail call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* %llp, i64 2)
+
+; CHECK: atom.cta.add.f32{{.*}}, %f{{[0-9]+}};
+  %tmp3r = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float %f)
+; CHECK: atom.cta.add.f32{{.*}}, 0f40400000;
+  %tmp3i = tail call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* %fp, float 3.0)
+; CHECK: atom.cta.add.f64{{.*}}, %fd{{[0-9]+}};
+  %tmp4r = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double %df)
+; CHECK: atom.cta.add.f64{{.*}}, 0d4010000000000000;
+  %tmp4i = tail call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* %dfp, double 4.0)
+
+; CAS is implemented separately and has more arguments
+; CHECK: atom.cta.cas.b32{{.*}}], %r{{[0-9]+}}, %r{{[0-9]+}};
+  %tmp5rr = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 %i)
+; For some reason in 64-bit mode we end up passing 51 via a register.
+; CHECK32: atom.cta.cas.b32{{.*}}], %r{{[0-9]+}}, 51;
+  %tmp5ri = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 %i, i32 51)
+; CHECK: atom.cta.cas.b32{{.*}}], 52, %r{{[0-9]+}};
+  %tmp5ir = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 52, i32 %i)
+; CHECK: atom.cta.cas.b32{{.*}}], 53, 54;
+  %tmp5ii = tail call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* %ip, i32 53, i32 54)
+
+; CHECK: ret
+  ret void
+}
+
+declare i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32(float* nocapture, float) #1
+declare double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64(double* nocapture, double) #1
+declare float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32(float* nocapture, float) #1
+declare double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64(double* nocapture, double) #1
+declare i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32(i32* nocapture, i32) #1
+declare i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64(i64* nocapture, i64) #1
+declare i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32(i32* nocapture, i32, i32) #1
+declare i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64(i64* nocapture, i64, i64) #1
+declare i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* nocapture, i32, i32) #1
+declare i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64(i64* nocapture, i64, i64) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
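Putting it together, a self-contained module exercising one of the new intrinsics (a sketch distilled from the test above; with -mcpu=sm_60 it should select atom.sys.cas.b32):

  target triple = "nvptx64-nvidia-cuda"

  declare i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* nocapture, i32, i32)

  ; Compare-and-swap that is ordered with respect to the whole system
  ; (all CTAs, plus the host), not just the current thread block.
  define i32 @cas_sys(i32* %p, i32 %cmp, i32 %new) {
    %old = call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32(i32* %p, i32 %cmp, i32 %new)
    ret i32 %old
  }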