Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -442,21 +442,6 @@ [IntrNoMem, IntrSpeculatable] >; -// Fields should mirror atomicrmw -class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], - [llvm_anyptr_ty, - LLVMMatchType<0>, - llvm_i32_ty, // ordering - llvm_i32_ty, // scope - llvm_i1_ty], // isVolatile - [IntrArgMemOnly, IntrWillReturn, NoCapture>, - ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", - [SDNPMemOperand] ->; - -def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; -def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; - class AMDGPULDSIntrin : Intrinsic<[llvm_any_ty], [LLVMQualPointerType, 3>, Index: llvm/include/llvm/Support/AtomicOrdering.h =================================================================== --- llvm/include/llvm/Support/AtomicOrdering.h +++ llvm/include/llvm/Support/AtomicOrdering.h @@ -74,7 +74,8 @@ // is a valid AtomicOrdering. template inline bool isValidAtomicOrdering(Int I) { return static_cast(AtomicOrdering::NotAtomic) <= I && - I <= static_cast(AtomicOrdering::SequentiallyConsistent); + I <= static_cast(AtomicOrdering::SequentiallyConsistent) && + I != 3; } /// String used by LLVM IR to represent atomic ordering. Index: llvm/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/lib/IR/AutoUpgrade.cpp +++ llvm/lib/IR/AutoUpgrade.cpp @@ -814,16 +814,25 @@ Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1") return true; - if (Name == "amdgcn.alignbit") { + if (Name.startswith("amdgcn.")) + Name = Name.substr(7); // Strip off "amdgcn." + + if (Name == "alignbit") { // Target specific intrinsic became redundant NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr, {F->getReturnType()}); return true; } + if (Name.startswith("atomic.inc") || Name.startswith("atomic.dec")) { + // This was replaced with atomicrmw uinc_wrap and udec_wrap, so there's no + // new declaration. + NewFn = nullptr; + return true; + } + break; } - case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { rename(F); @@ -2061,6 +2070,38 @@ llvm_unreachable("Unknown function for ARM CallBase upgrade."); } +static Value *UpgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, + Function *F, IRBuilder<> &Builder) { + const bool IsInc = Name.startswith("atomic.inc."); + if (IsInc || Name.startswith("atomic.dec.")) { + if (CI->getNumOperands() != 6) // Malformed bitcode. + return nullptr; + + AtomicRMWInst::BinOp RMWOp = + IsInc ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap; + + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + ConstantInt *OrderArg = dyn_cast(CI->getArgOperand(2)); + ConstantInt *VolatileArg = dyn_cast(CI->getArgOperand(4)); + + AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent; + if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue())) + Order = static_cast(OrderArg->getZExtValue()); + if (Order == AtomicOrdering::NotAtomic || + Order == AtomicOrdering::Unordered) + Order = AtomicOrdering::SequentiallyConsistent; + + AtomicRMWInst *RMW = Builder.CreateAtomicRMW(RMWOp, Ptr, Val, None, Order); + + if (!VolatileArg || !VolatileArg->isZero()) + RMW->setVolatile(true); + return RMW; + } + + llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade."); +} + /// Upgrade a call to an old intrinsic. All argument and return casting must be /// provided to seamlessly integrate with existing context. void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { @@ -2091,6 +2132,9 @@ bool IsARM = Name.startswith("arm."); if (IsARM) Name = Name.substr(4); + bool IsAMDGCN = Name.startswith("amdgcn."); + if (IsAMDGCN) + Name = Name.substr(7); if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); @@ -3913,6 +3957,8 @@ CI->getArgOperand(0), "h2f"); } else if (IsARM) { Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder); + } else if (IsAMDGCN) { + Rep = UpgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else { llvm_unreachable("Unknown function for CallBase upgrade."); } Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -198,9 +198,6 @@ bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const; - bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, - bool IsInc) const; - bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4623,20 +4623,6 @@ return true; } -bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, - MachineIRBuilder &B, - bool IsInc) const { - unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP : - AMDGPU::G_ATOMICRMW_UDEC_WRAP; - B.buildInstr(Opc) - .addDef(MI.getOperand(0).getReg()) - .addUse(MI.getOperand(2).getReg()) - .addUse(MI.getOperand(3).getReg()) - .cloneMemRefs(MI); - MI.eraseFromParent(); - return true; -} - static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { switch (IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_swap: @@ -5780,10 +5766,6 @@ case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return legalizeBufferAtomic(MI, B, IntrID); - case Intrinsic::amdgcn_atomic_inc: - return legalizeAtomicIncDec(MI, B, true); - case Intrinsic::amdgcn_atomic_dec: - return legalizeAtomicIncDec(MI, B, false); case Intrinsic::trap: return legalizeTrapIntrinsic(MI, MRI, B); case Intrinsic::debugtrap: Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -484,8 +484,6 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: @@ -1009,8 +1007,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const { switch (IID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: @@ -1031,8 +1027,6 @@ Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1063,8 +1063,6 @@ } switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: @@ -1206,8 +1204,6 @@ SmallVectorImpl &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_append: @@ -7321,8 +7317,6 @@ M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: { MemSDNode *M = cast(Op); unsigned Opc; @@ -7330,12 +7324,6 @@ case Intrinsic::amdgcn_ds_fadd: Opc = ISD::ATOMIC_LOAD_FADD; break; - case Intrinsic::amdgcn_atomic_inc: - Opc = ISD::ATOMIC_LOAD_UINC_WRAP; - break; - case Intrinsic::amdgcn_atomic_dec: - Opc = ISD::ATOMIC_LOAD_UDEC_WRAP; - break; } return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), @@ -7347,12 +7335,6 @@ MemSDNode *M = cast(Op); unsigned Opc; switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - Opc = ISD::ATOMIC_LOAD_UINC_WRAP; - break; - case Intrinsic::amdgcn_atomic_dec: - Opc = ISD::ATOMIC_LOAD_UDEC_WRAP; - break; case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; Index: llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll +++ llvm/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -16,39 +16,10 @@ ret void } -; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @test_atomic_inc_i32(ptr addrspace(1) %ptr, i32 %val) #0 { - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) -define i64 @test_atomic_inc_i64(ptr addrspace(1) %ptr, i64 %val) #0 { - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @test_atomic_dec_i32(ptr addrspace(1) %ptr, i32 %val) #0 { - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) -define i64 @test_atomic_dec_i64(ptr addrspace(1) %ptr, i64 %val) #0 { - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) - ret i64 %ret -} - -declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1 -declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1 -declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1 - ; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 { %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) - store i32 %ret, ptr addrspace(1) %ptr, align 4 + store i32 %ret, i32 addrspace(1)* %ptr, align 4 ret void } Index: llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll +++ llvm/test/Analysis/LegacyDivergenceAnalysis/AMDGPU/atomics.ll @@ -15,37 +15,8 @@ ret void } -; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @test_atomic_inc_i32(ptr addrspace(1) %ptr, i32 %val) #0 { - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) -define i64 @test_atomic_inc_i64(ptr addrspace(1) %ptr, i64 %val) #0 { - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @test_atomic_dec_i32(ptr addrspace(1) %ptr, i32 %val) #0 { - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) -define i64 @test_atomic_dec_i64(ptr addrspace(1) %ptr, i64 %val) #0 { - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false) - ret i64 %ret -} - -declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1 -declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1 -declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1 - ; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) -define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 { +define amdgpu_kernel void @test_atomic_csub_i32(i32 addrspace(1)* %ptr, i32 %val) #0 { %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) store i32 %ret, ptr addrspace(1) %ptr, align 4 ret void Index: llvm/test/Bitcode/amdgcn-atomic.ll =================================================================== --- /dev/null +++ llvm/test/Bitcode/amdgcn-atomic.ll @@ -0,0 +1,115 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + + +define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i32 46 seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i64 48 seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr0, i64 48, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i64 45 seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr1, i64 45, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8 + %result6 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 true) + ret void +} + +define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i32 46 seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i64 48 seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr0, i64 48, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i64 45 seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr1, i64 45, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw volatile udec_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8 + %result6 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 true) + ret void +} + +; Test some invalid ordering handling +define void @ordering(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true) + + ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false) + + ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 monotonic, align 4 + %result3 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true) + + ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result4 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false) + + ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result5 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true) + + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result6 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false) + + ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result7 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true) + + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result8 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false) + + ; CHECK:= atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result9 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true) + + ; CHECK:= atomicrmw volatile udec_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4 + %result10 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true) + ret void +} + +define void @immarg_violations(ptr %ptr0, i32 %val32, i1 %val1) { + ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false) + +; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 monotonic, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false) + + ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 monotonic, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1) + ret void +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 + +declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 + +attributes #0 = { argmemonly nounwind willreturn } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -0,0 +1,2948 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s + +; FIXME: Merge with other test. DS offset folding doesn't work due to +; register bank copies, and no return optimization is missing. + +@lds0 = internal addrspace(3) global [512 x i32] undef +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32 addrspace(3)* %ptr, i32 42 seq_cst, align 4 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw udec_wrap i32 addrspace(3)* %gep, i32 42 seq_cst, align 4 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_u32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_u32 v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_u32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_u32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_u32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32 addrspace(3)* %ptr, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_u32 v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_u32 v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_u32 v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_u32 v1, v0 offset:16 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw udec_wrap i32 addrspace(3)* %gep, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32 addrspace(1)* %ptr, i32 42 seq_cst, align 4 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = atomicrmw udec_wrap i32 addrspace(1)* %gep, i32 42 seq_cst, align 4 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32 addrspace(1)* %ptr, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = atomicrmw udec_wrap i32 addrspace(1)* %gep, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i32 addrspace(1)* %gep, i32 42 seq_cst, align 4 + store i32 %result, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i32 addrspace(1)* %gep, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32* %ptr, i32 42 seq_cst, align 4 + store i32 %result, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s2, s2, 16 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = atomicrmw udec_wrap i32* %gep, i32 42 seq_cst, align 4 + store i32 %result, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i32* %ptr, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 16 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = atomicrmw udec_wrap i32* %gep, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v3, v[0:1], v3 offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v3, v[0:1], v3 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_store_b32 v[0:1], v3 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %out.gep = getelementptr i32, i32* %out, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i32* %gep, i32 42 seq_cst, align 4 + store i32 %result, i32* %out.gep, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i32* %gep, i32 42 seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64* %ptr, i64 42 seq_cst, align 8 + store i64 %result, i64* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s2, s2, 32 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = atomicrmw udec_wrap i64* %gep, i64 42 seq_cst, align 8 + store i64 %result, i64* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64* %ptr, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 32 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = atomicrmw udec_wrap i64* %gep, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %out.gep = getelementptr i64, i64* %out, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i64* %gep, i64 42 seq_cst, align 8 + store i64 %result, i64* %out.gep, align 4 + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #1 { +; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i64* %gep, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #1 { +; CI-LABEL: atomic_dec_shl_base_lds_0: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CI-NEXT: v_mov_b32_e32 v2, 9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_store_dword v[0:1], v3 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 9 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_dec_shl_base_lds_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 9 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-NEXT: global_store_dword v2, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_shl_base_lds_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %result = atomicrmw udec_wrap i32 addrspace(3)* %arrayidx0, i32 9 seq_cst, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64 addrspace(3)* %ptr, i64 42 seq_cst, align 8 + store i64 %result, i64 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw udec_wrap i64 addrspace(3)* %gep, i64 42 seq_cst, align 8 + store i64 %result, i64 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_u64 v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_u64 v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_u64 v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_u64 v2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_u64 v2, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64 addrspace(3)* %ptr, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) #1 { +; CI-LABEL: lds_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_dec_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_dec_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_u64 v2, v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw udec_wrap i64 addrspace(3)* %gep, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64 addrspace(1)* %ptr, i64 42 seq_cst, align 8 + store i64 %result, i64 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = atomicrmw udec_wrap i64 addrspace(1)* %gep, i64 42 seq_cst, align 8 + store i64 %result, i64 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %result = atomicrmw udec_wrap i64 addrspace(1)* %ptr, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = atomicrmw udec_wrap i64 addrspace(1)* %gep, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id + %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i64 addrspace(1)* %gep, i64 42 seq_cst, align 8 + store i64 %result, i64 addrspace(1)* %out.gep, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #1 { +; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id + %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 + %result = atomicrmw udec_wrap i64 addrspace(1)* %gep, i64 42 seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #1 { +; CI-LABEL: atomic_dec_shl_base_lds_0_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v1, 9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_store_dword v[3:4], v0 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v1, 9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_store_dword v[3:4], v0 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_dec_shl_base_lds_0_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 9 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dword v3, v0, s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %result = atomicrmw udec_wrap i64 addrspace(3)* %arrayidx0, i64 9 seq_cst, align 8 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + store i64 %result, i64 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } +attributes #2 = { nounwind memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -8,18 +8,12 @@ ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. +@lds0 = internal addrspace(3) global [512 x i32] undef, align 4 +@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 -declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32, i32, i1) #2 - -declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) nocapture, i64, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2 - -declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #0 -define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 @@ -28,10 +22,11 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -43,10 +38,11 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -57,9 +53,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -70,9 +67,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -83,22 +83,21 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 - store i32 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } -!0 = !{!1} -!1 = distinct !{!1, !2} -!2 = distinct !{!2} - -define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 @@ -107,10 +106,11 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -122,10 +122,11 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,9 +137,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -149,9 +151,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -162,19 +167,22 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -182,7 +190,9 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_u32 v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: lds_atomic_inc_noret_i32: @@ -192,7 +202,9 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_u32 v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: lds_atomic_inc_noret_i32: @@ -201,7 +213,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_u32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: @@ -210,7 +224,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_u32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lds_atomic_inc_noret_i32: @@ -218,13 +236,17 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_u32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -232,7 +254,9 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_u32 v1, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: lds_atomic_inc_noret_i32_offset: @@ -242,7 +266,9 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_u32 v1, v0 offset:16 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: @@ -251,7 +277,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: @@ -260,7 +288,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: @@ -268,14 +300,18 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -283,10 +319,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -297,10 +335,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -309,9 +349,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -320,9 +361,12 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -330,18 +374,21 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -351,10 +398,12 @@ ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -367,10 +416,12 @@ ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -379,9 +430,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -390,9 +442,12 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -400,19 +455,22 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -420,7 +478,10 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32: @@ -430,7 +491,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32: @@ -438,8 +502,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32: @@ -447,23 +513,30 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -473,7 +546,10 @@ ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset: @@ -485,7 +561,10 @@ ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: @@ -493,8 +572,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset: @@ -502,24 +583,31 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -532,12 +620,14 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; @@ -553,12 +643,14 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; @@ -567,9 +659,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -578,9 +671,12 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -588,9 +684,12 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -598,12 +697,12 @@ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr addrspace(1) %out.gep + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 + store i32 %result, ptr addrspace(1) %out.gep, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -616,7 +715,10 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: @@ -631,7 +733,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: @@ -639,8 +744,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: @@ -648,37 +755,43 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 ret void } -@lds0 = internal addrspace(3) global [512 x i32] undef, align 4 - -define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 -; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 @@ -693,9 +806,10 @@ ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 -; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -709,10 +823,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: global_store_dword v2, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -723,9 +838,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 9 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-NEXT: global_store_dword v2, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -735,24 +853,27 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 - %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false) - store i32 %idx.0, ptr addrspace(1) %add_use - store i32 %val0, ptr addrspace(1) %out + %arrayidx0 = getelementptr inbounds [512 x i32], pr addrspace(3) @lds0, i32 0, i32 %idx.0 + %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i32 9 seq_cst, align 4 + store i32 %idx.0, ptr addrspace(1) %add_use, align 4 + store i32 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 @@ -762,10 +883,11 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -778,10 +900,11 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -793,9 +916,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -807,9 +931,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -821,18 +948,21 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8 + store i64 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[4:5], 0x2 @@ -842,10 +972,11 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -858,10 +989,11 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -873,9 +1005,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -887,9 +1020,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -901,19 +1037,22 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8 + store i64 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -922,7 +1061,9 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_u64 v2, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: lds_atomic_inc_noret_i64: @@ -933,7 +1074,9 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_u64 v2, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: lds_atomic_inc_noret_i64: @@ -943,7 +1086,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_u64 v2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_noret_i64: @@ -953,7 +1098,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_u64 v2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lds_atomic_inc_noret_i64: @@ -962,13 +1111,17 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_u64 v2, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -977,7 +1130,9 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_endpgm ; ; VI-LABEL: lds_atomic_inc_noret_i64_offset: @@ -988,7 +1143,9 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: @@ -998,7 +1155,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: @@ -1008,7 +1167,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: @@ -1017,14 +1180,18 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1033,10 +1200,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1048,10 +1217,12 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1061,9 +1232,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1073,9 +1245,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1084,18 +1259,21 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8 + store i64 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1106,10 +1284,12 @@ ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1123,10 +1303,12 @@ ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1136,9 +1318,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1148,9 +1331,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1159,19 +1345,22 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 + store i64 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1180,7 +1369,10 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64: @@ -1191,7 +1383,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64: @@ -1200,8 +1395,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64: @@ -1210,8 +1407,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64: @@ -1219,15 +1420,18 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1238,7 +1442,10 @@ ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: @@ -1251,7 +1458,10 @@ ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: @@ -1260,8 +1470,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: @@ -1270,8 +1482,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset: @@ -1279,16 +1495,19 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1302,12 +1521,14 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1324,12 +1545,14 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1339,9 +1562,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1351,9 +1575,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1362,9 +1589,12 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1372,12 +1602,12 @@ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr addrspace(1) %out.gep + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 + store i64 %result, ptr addrspace(1) %out.gep, align 4 ret void } -define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1391,7 +1621,10 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: @@ -1407,7 +1640,10 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: @@ -1416,8 +1652,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: @@ -1426,8 +1664,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64: @@ -1435,31 +1677,86 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 { -; GCN-LABEL: flat_atomic_inc_ret_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 42 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_atomic_inc v2, v[0:1], v2 glc -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: s_endpgm +define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { +; CI-LABEL: flat_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: @@ -1467,18 +1764,22 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr %out + %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4 + store i32 %result, ptr %out, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1488,10 +1789,12 @@ ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -1504,10 +1807,12 @@ ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1518,10 +1823,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; @@ -1534,10 +1841,14 @@ ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; @@ -1547,28 +1858,77 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr %out + %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 + store i32 %result, ptr %out, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind { -; GCN-LABEL: flat_atomic_inc_noret_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 42 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_atomic_inc v[0:1], v2 -; GCN-NEXT: s_endpgm +define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { +; CI-LABEL: flat_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: @@ -1576,14 +1936,19 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1593,7 +1958,10 @@ ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: @@ -1605,7 +1973,10 @@ ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: @@ -1615,7 +1986,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: @@ -1627,7 +2001,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: @@ -1636,15 +2016,20 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1657,12 +2042,14 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; @@ -1678,12 +2065,14 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; @@ -1697,12 +2086,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm ; @@ -1718,12 +2109,16 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v3 ; GFX10-NEXT: s_endpgm ; @@ -1736,12 +2131,16 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_store_b32 v[0:1], v3 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1749,12 +2148,12 @@ %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) - store i32 %result, ptr %out.gep + %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 + store i32 %result, ptr %out.gep, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1767,7 +2166,10 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1782,7 +2184,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1795,7 +2200,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1810,7 +2218,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc v[0:1], v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1823,19 +2237,22 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 - %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 ret void } -@lds1 = internal addrspace(3) global [512 x i64] undef, align 8 - -define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1843,6 +2260,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s3 @@ -1861,6 +2279,7 @@ ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 @@ -1878,10 +2297,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm @@ -1893,9 +2313,12 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: global_store_dword v3, v0, s[2:3] ; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; GFX10-NEXT: s_endpgm @@ -1906,38 +2329,96 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 - %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false) - store i32 %idx.0, ptr addrspace(1) %add_use - store i64 %val0, ptr addrspace(1) %out + %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i64 9 seq_cst, align 8 + store i32 %idx.0, ptr addrspace(1) %add_use, align 4 + store i64 %result, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 { -; GCN-LABEL: flat_atomic_inc_ret_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm +define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { +; CI-LABEL: flat_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: @@ -1946,18 +2427,22 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr %out + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8 + store i64 %result, ptr %out, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1968,10 +2453,12 @@ ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1985,10 +2472,12 @@ ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -2000,10 +2489,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2017,10 +2508,14 @@ ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2031,29 +2526,81 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr %out + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + store i64 %result, ptr %out, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { -; GCN-LABEL: flat_atomic_inc_noret_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm +define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { +; CI-LABEL: flat_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64: ; GFX11: ; %bb.0: @@ -2062,14 +2609,19 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2080,7 +2632,10 @@ ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: @@ -2093,7 +2648,10 @@ ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: @@ -2104,7 +2662,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: @@ -2117,7 +2678,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: @@ -2127,15 +2694,20 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -2149,12 +2721,14 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -2171,12 +2745,14 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -2191,12 +2767,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2213,12 +2791,16 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm ; @@ -2233,12 +2815,16 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2246,12 +2832,12 @@ %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) - store i64 %result, ptr %out.gep + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + store i64 %result, ptr %out.gep, align 4 ret void } -define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2265,7 +2851,10 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -2281,7 +2870,10 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -2295,7 +2887,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -2311,7 +2906,13 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -2325,17 +2926,22 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 ret void } -define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #0 { +define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s6, s[4:5], 0x4 @@ -2344,16 +2950,17 @@ ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s6 -; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0 -; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: flat_store_dword v[0:1], v4 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: flat_store_dword v[2:3], v5 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: @@ -2364,45 +2971,54 @@ ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0 -; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 -; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: ds_inc_rtn_u32 v2, v1, v0 -; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm @@ -2411,25 +3027,33 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: ds_inc_rtn_u32 v2, v1, v0 -; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) - %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false) - - store i32 %result0, ptr addrspace(1) %out0 - store i32 %result1, ptr addrspace(1) %out1 + %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 + %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 + store i32 %result0, ptr addrspace(1) %out0, align 4 + store i32 %result1, ptr addrspace(1) %out1, align 4 ret void } -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind argmemonly } +attributes #0 = { nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } +attributes #2 = { nounwind memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} Index: llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -631,54 +631,6 @@ ret void } -; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32( -; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28 -; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false) -define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { -entry: - %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999 - %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7 - %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 - %tmp0 = icmp eq i32 %tid, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false) - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, ptr addrspace(3) %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32( -; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28 -; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false) -define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) { -entry: - %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999 - %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7 - %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 - %tmp0 = icmp eq i32 %tid, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false) - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, ptr addrspace(3) %out.gep - br label %done - -done: - ret void -} - ; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset( ; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096 ; OPT-SICIV: br @@ -790,8 +742,6 @@ } declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 -declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3 declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3 Index: llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -430,8 +430,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_inc v2, v0, v0, s[0:1] glc +; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -439,7 +440,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false) + %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 monotonic %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -451,8 +452,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_dec v2, v0, v0, s[0:1] glc +; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -460,7 +462,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false) + %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 monotonic %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 store float 1.0, ptr addrspace(1) %p1 @@ -952,8 +954,6 @@ ret void } -declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg) -declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1), double) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1), double) declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -3360,9 +3360,6 @@ ; amdgcn atomic inc ; -------------------------------------------------------------------------------- -declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 -declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 - define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_inc_saddr_i32_rtn: ; GCN: ; %bb.0: @@ -3377,7 +3374,7 @@ ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3397,7 +3394,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3415,7 +3412,7 @@ ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic ret void } @@ -3433,7 +3430,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic ret void } @@ -3451,7 +3448,7 @@ ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3471,7 +3468,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3489,7 +3486,7 @@ ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic ret void } @@ -3507,7 +3504,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic ret void } @@ -3515,8 +3512,6 @@ ; amdgcn atomic dec ; -------------------------------------------------------------------------------- -declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 -declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_dec_saddr_i32_rtn: @@ -3532,7 +3527,7 @@ ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3552,7 +3547,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } @@ -3570,7 +3565,7 @@ ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic ret void } @@ -3588,7 +3583,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic ret void } @@ -3606,7 +3601,7 @@ ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3626,7 +3621,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false) + %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } @@ -3644,7 +3639,7 @@ ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic ret void } @@ -3662,7 +3657,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false) + %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic ret void } Index: llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll =================================================================== --- llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll +++ llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll @@ -16,121 +16,8 @@ ret i64 %val } -; CHECK-LABEL: @atomicinc_global_to_flat_i32( -; CHECK: call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %global.ptr, i32 %y, i32 0, i32 0, i1 false) -define i32 @atomicinc_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %y) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %cast, i32 %y, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK-LABEL: @atomicinc_group_to_flat_i32( -; CHECK: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %group.ptr, i32 %y, i32 0, i32 0, i1 false) -define i32 @atomicinc_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %y) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %cast, i32 %y, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK-LABEL: @atomicinc_global_to_flat_i64( -; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %global.ptr, i64 %y, i32 0, i32 0, i1 false) -define i64 @atomicinc_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK-LABEL: @atomicinc_group_to_flat_i64( -; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %group.ptr, i64 %y, i32 0, i32 0, i1 false) -define i64 @atomicinc_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK-LABEL: @atomicdec_global_to_flat_i32( -; CHECK: call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %global.ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @atomicdec_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %val) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK-LABEL: @atomicdec_group_to_flat_i32( -; CHECK: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %group.ptr, i32 %val, i32 0, i32 0, i1 false) -define i32 @atomicdec_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %val) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 false) - ret i32 %ret -} - -; CHECK-LABEL: @atomicdec_global_to_flat_i64( -; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %global.ptr, i64 %y, i32 0, i32 0, i1 false) -define i64 @atomicdec_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK-LABEL: @atomicdec_group_to_flat_i64( -; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %group.ptr, i64 %y, i32 0, i32 0, i1 false -define i64 @atomicdec_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false) - ret i64 %ret -} - -; CHECK-LABEL: @volatile_atomicinc_group_to_flat_i64( -; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) -define i64 @volatile_atomicinc_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) - ret i64 %ret -} - -; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i32( -; CHECK-NEXT: %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr -; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true) -define i32 @volatile_atomicdec_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %val) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true) - ret i32 %ret -} - -; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i32( -; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr -; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true) -define i32 @volatile_atomicdec_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %val) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true) - ret i32 %ret -} - -; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i64( -; CHECK-NEXT: %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) -define i64 @volatile_atomicdec_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) - ret i64 %ret -} - -; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i64( -; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr -; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) -define i64 @volatile_atomicdec_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 { - %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr - %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true) - ret i64 %ret -} - declare i32 @llvm.objectsize.i32.p0(ptr, i1, i1, i1) #1 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32, i32, i1) #2 -declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32, i32, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll =================================================================== --- llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll +++ llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll @@ -87,87 +87,5 @@ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph } -; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32( - -; OPT: .lr.ph.preheader: -; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383 -; OPT: br label %.lr.ph -; OPT: .lr.ph: -; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] -; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] -; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] -; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false) -; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false) -; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1 -define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { -bb: - %tmp = icmp sgt i32 %n, 0 - br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge - -.lr.ph.preheader: ; preds = %bb - br label %.lr.ph - -._crit_edge.loopexit: ; preds = %.lr.ph - br label %._crit_edge - -._crit_edge: ; preds = %._crit_edge.loopexit, %bb - ret void - -.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader - %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] - %tmp1 = add nuw nsw i32 %indvars.iv, 16383 - %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 - %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false) - %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv - %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false) - %tmp8 = add nsw i32 %tmp7, %tmp4 - atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst - %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 - %exitcond = icmp eq i32 %indvars.iv.next, %n - br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph -} - -; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32( -; OPT: .lr.ph.preheader: -; OPT: %scevgep2 = getelementptr i32, i32 addrspace(3)* %arg1, i32 16383 -; OPT: br label %.lr.ph -; OPT: .lr.ph: -; OPT: %lsr.iv3 = phi i32 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ] -; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] -; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] -; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv3, i32 undef, i32 0, i32 0, i1 false) -; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false) -; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv3, i32 1 -define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { -bb: - %tmp = icmp sgt i32 %n, 0 - br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge - -.lr.ph.preheader: ; preds = %bb - br label %.lr.ph - -._crit_edge.loopexit: ; preds = %.lr.ph - br label %._crit_edge - -._crit_edge: ; preds = %._crit_edge.loopexit, %bb - ret void - -.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader - %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] - %tmp1 = add nuw nsw i32 %indvars.iv, 16383 - %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 - %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false) - %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv - %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false) - %tmp8 = add nsw i32 %tmp7, %tmp4 - atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst - %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 - %exitcond = icmp eq i32 %indvars.iv.next, %n - br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph -} - -declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1 - attributes #0 = { nounwind } attributes #1 = { nounwind argmemonly } Index: llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll =================================================================== --- llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -142,44 +142,6 @@ ret i64 %result } -declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) -define amdgpu_kernel void @invalid_atomic_inc(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %var, i1 %bool) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %var - ; CHECK-NEXT: %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false) - %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false) - - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %var - ; CHECK-NEXT: %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false) - %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false) - - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool) - %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool) - ret void -} - -declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) -define amdgpu_kernel void @invalid_atomic_dec(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %var, i1 %bool) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %var - ; CHECK-NEXT: %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false) - %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false) - - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %var - ; CHECK-NEXT: %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false) - %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false) - - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool) - %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool) - ret void -} - declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) define amdgpu_kernel void @test_div_scale_f32_val_undef_undef(ptr addrspace(1) %out) { ; CHECK: immarg operand has non-immediate parameter