diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h --- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h +++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h @@ -10,6 +10,7 @@ #define LLVM_CODEGEN_ATOMICEXPANDUTILS_H #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/AtomicOrdering.h" @@ -57,7 +58,9 @@ /// [...] /// /// Returns true if the containing function was modified. -bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg); +bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); } // end namespace llvm diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" @@ -48,6 +49,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LowerAtomic.h" #include #include @@ -62,6 +64,7 @@ class AtomicExpand : public FunctionPass { const TargetLowering *TLI = nullptr; const DataLayout *DL = nullptr; + SmallVector CmpXchgLoopBlocks; public: static char ID; // Pass identification, replacement for typeid @@ -72,6 +75,11 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + private: bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); @@ -104,7 +112,8 @@ IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg); + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); @@ -123,7 +132,8 @@ friend bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg); + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); }; // IRBuilder to be used for replacement atomic instructions. @@ -142,9 +152,12 @@ char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, - false) - +INITIALIZE_PASS_BEGIN(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } // Helper functions to retrieve the size of atomic instructions. @@ -190,6 +203,7 @@ return false; TLI = Subtarget->getTargetLowering(); DL = &F.getParent()->getDataLayout(); + CmpXchgLoopBlocks.clear(); SmallVector AtomicInsts; @@ -337,6 +351,22 @@ } else if (CASI) MadeChange |= tryExpandAtomicCmpXchg(CASI); } + + DominatorTreeWrapperPass *const DTW = + getAnalysisIfAvailable(); + DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr, + DomTreeUpdater::UpdateStrategy::Eager); + auto TTI = &getAnalysis().getTTI(F); + for (BasicBlock *BB : CmpXchgLoopBlocks) { + simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr, + SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true)); + } return MadeChange; } @@ -602,7 +632,7 @@ << AI->getOperationName(AI->getOperation()) << " operation at " << MemScope << " memory scope"; }); - expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun, CmpXchgLoopBlocks); } return true; } @@ -873,7 +903,8 @@ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment, MemOpOrder, SSID, - PerformPartwordOp, createCmpXchgInstFun); + PerformPartwordOp, createCmpXchgInstFun, + CmpXchgLoopBlocks); } else { assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, @@ -1479,7 +1510,8 @@ IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg) { + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks) { LLVMContext &Ctx = Builder.getContext(); BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); @@ -1501,8 +1533,9 @@ // [...] BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); + CmpXchgLoopBlocks.push_back(ExitBB); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - + CmpXchgLoopBlocks.push_back(LoopBB); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we want a load. It's easiest to just remove // the branch entirely. @@ -1559,8 +1592,9 @@ } // Note: This function is exposed externally by AtomicExpandUtils.h -bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg) { +bool llvm::expandAtomicRMWToCmpXchg( + AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks) { ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout()); Builder.setIsFPConstrained( AI->getFunction()->hasFnAttribute(Attribute::StrictFP)); @@ -1574,7 +1608,7 @@ return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded, AI->getValOperand()); }, - CreateCmpXchg); + CreateCmpXchg, CmpXchgLoopBlocks); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -1722,9 +1756,10 @@ // CAS libcall, via a CAS loop, instead. if (!Success) { expandAtomicRMWToCmpXchg( - I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded, - Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, - SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { + I, + [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded, + Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, + SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { // Create the CAS instruction normally... AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, Alignment, MemOpOrder, @@ -1734,7 +1769,8 @@ // ...and then expand the CAS into a libcall. expandAtomicCASToLibcall(Pair); - }); + }, + CmpXchgLoopBlocks); } } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ /dev/null @@ -1,2929 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s - -declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: .LBB1_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB1_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB3_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: .LBB5_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB5_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ /dev/null @@ -1,3199 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s - -declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX8-NEXT: .LBB1_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB1_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB3_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX8-NEXT: .LBB5_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB5_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ /dev/null @@ -1,3199 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s - -declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX8-NEXT: .LBB1_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB1_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB3_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX8-NEXT: .LBB5_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB5_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 -; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 -; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 -; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 -; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ /dev/null @@ -1,3100 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s - -declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: .LBB1_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB1_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB1_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: buffer_wbinvl1_vol -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: buffer_gl0_inv -; GFX1064-DPP-NEXT: buffer_gl1_inv -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: buffer_gl0_inv -; GFX1032-DPP-NEXT: buffer_gl1_inv -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: buffer_gl0_inv -; GFX1164-DPP-NEXT: buffer_gl1_inv -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: buffer_gl0_inv -; GFX1132-DPP-NEXT: buffer_gl1_inv -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: .LBB3_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB3_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB3_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 -; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_3 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: -; GFX1132-DPP-NEXT: s_endpgm - %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { -; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: .LBB5_1: ; %ComputeLoop -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s4, s5, s4 -; GFX8-NEXT: v_readlane_b32 s6, v0, s4 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB5_5 -; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: .LBB5_1: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s4, s5, s4 -; GFX9-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s4, s5, s4 -; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 -; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 -; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 -; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 -; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc -; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_min_u32 s4, s5, s4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 -; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 -; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 -; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: -; GFX1164-NEXT: s_endpgm -; -; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 -; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 -; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 -; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: -; GFX1132-NEXT: s_endpgm -; -; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-DPP-NEXT: s_not_b64 exec, exec -; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 -; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc -; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: -; GFX9-DPP-NEXT: s_endpgm -; -; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: s_not_b64 exec, exec -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 -; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 -; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: -; GFX1064-DPP-NEXT: s_endpgm -; -; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc -; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: -; GFX1032-DPP-NEXT: s_endpgm -; -; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: s_not_b64 exec, exec -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: -; GFX1164-DPP-NEXT: s_endpgm -; -; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: -; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 -; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: -; GFX1132-DPP-NEXT: s_endpgm - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 - ret void -} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s + +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, float %in) #0 { +; GFX90A-LABEL: @divergent_cfg( +; GFX90A-NEXT: entry: +; GFX90A-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX90A-NEXT: [[D_CMP:%.*]] = icmp ult i32 [[TID]], 16 +; GFX90A-NEXT: br i1 [[D_CMP]], label [[IF:%.*]], label [[ELSE:%.*]] +; GFX90A: if: +; GFX90A-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(1) [[OUT:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP0]], [[IF]] ], [ [[TMP4:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[IN:%.*]] +; GFX90A-NEXT: [[TMP1:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[OUT]], i32 [[TMP2]], i32 [[TMP1]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +; GFX90A-NEXT: [[TMP4]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ENDIF:%.*]], label [[ATOMICRMW_START]] +; GFX90A: else: +; GFX90A-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(1) [[OUT]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START2:%.*]] +; GFX90A: atomicrmw.start2: +; GFX90A-NEXT: [[LOADED3:%.*]] = phi float [ [[TMP5]], [[ELSE]] ], [ [[TMP9:%.*]], [[ATOMICRMW_START2]] ] +; GFX90A-NEXT: [[NEW4:%.*]] = fadd float [[LOADED3]], [[IN]] +; GFX90A-NEXT: [[TMP6:%.*]] = bitcast float [[NEW4]] to i32 +; GFX90A-NEXT: [[TMP7:%.*]] = bitcast float [[LOADED3]] to i32 +; GFX90A-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[OUT]], i32 [[TMP7]], i32 [[TMP6]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS5:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; GFX90A-NEXT: [[NEWLOADED6:%.*]] = extractvalue { i32, i1 } [[TMP8]], 0 +; GFX90A-NEXT: [[TMP9]] = bitcast i32 [[NEWLOADED6]] to float +; GFX90A-NEXT: br i1 [[SUCCESS5]], label [[ENDIF]], label [[ATOMICRMW_START2]] +; GFX90A: endif: +; GFX90A-NEXT: [[COMBINE:%.*]] = phi float [ [[TMP4]], [[ATOMICRMW_START]] ], [ [[TMP9]], [[ATOMICRMW_START2]] ] +; GFX90A-NEXT: store float [[COMBINE]], ptr addrspace(1) [[OUT]], align 4 +; GFX90A-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %res_if = atomicrmw fadd ptr addrspace(1) %out, float %in seq_cst + br label %endif + +else: + %res_else = atomicrmw fadd ptr addrspace(1) %out, float %in seq_cst + br label %endif + +endif: + %combine = phi float [%res_if, %if], [%res_else, %else] + store float %combine, ptr addrspace(1) %out + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }