diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h --- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h +++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h @@ -10,6 +10,7 @@ #define LLVM_CODEGEN_ATOMICEXPANDUTILS_H #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/AtomicOrdering.h" @@ -57,7 +58,9 @@ /// [...] /// /// Returns true if the containing function was modified. -bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg); +bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); } // end namespace llvm diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" @@ -48,6 +49,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LowerAtomic.h" #include #include @@ -62,6 +64,7 @@ class AtomicExpand : public FunctionPass { const TargetLowering *TLI = nullptr; const DataLayout *DL = nullptr; + SmallVector CmpXchgLoopBlocks; public: static char ID; // Pass identification, replacement for typeid @@ -72,6 +75,11 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + private: bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); @@ -104,7 +112,8 @@ IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg); + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); @@ -123,7 +132,8 @@ friend bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg); + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks); }; // IRBuilder to be used for replacement atomic instructions. @@ -142,9 +152,12 @@ char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, - false) - +INITIALIZE_PASS_BEGIN(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } // Helper functions to retrieve the size of atomic instructions. @@ -190,6 +203,7 @@ return false; TLI = Subtarget->getTargetLowering(); DL = &F.getParent()->getDataLayout(); + CmpXchgLoopBlocks.clear(); SmallVector AtomicInsts; @@ -337,6 +351,22 @@ } else if (CASI) MadeChange |= tryExpandAtomicCmpXchg(CASI); } + + DominatorTreeWrapperPass *const DTW = + getAnalysisIfAvailable(); + DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr, + DomTreeUpdater::UpdateStrategy::Eager); + auto TTI = &getAnalysis().getTTI(F); + for (BasicBlock *BB : CmpXchgLoopBlocks) { + simplifyCFG(BB, *TTI, RequireAndPreserveDomTree ? &DTU : nullptr, + SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true)); + } return MadeChange; } @@ -604,7 +634,7 @@ << AI->getOperationName(AI->getOperation()) << " operation at " << MemScope << " memory scope"; }); - expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun, CmpXchgLoopBlocks); } return true; } @@ -880,7 +910,8 @@ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment, MemOpOrder, SSID, - PerformPartwordOp, createCmpXchgInstFun); + PerformPartwordOp, createCmpXchgInstFun, + CmpXchgLoopBlocks); } else { assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, @@ -1486,7 +1517,8 @@ IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg) { + CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks) { LLVMContext &Ctx = Builder.getContext(); BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); @@ -1508,8 +1540,9 @@ // [...] BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); + CmpXchgLoopBlocks.push_back(ExitBB); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - + CmpXchgLoopBlocks.push_back(LoopBB); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we want a load. It's easiest to just remove // the branch entirely. @@ -1566,8 +1599,9 @@ } // Note: This function is exposed externally by AtomicExpandUtils.h -bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, - CreateCmpXchgInstFun CreateCmpXchg) { +bool llvm::expandAtomicRMWToCmpXchg( + AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg, + SmallVector &CmpXchgLoopBlocks) { ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout()); Builder.setIsFPConstrained( AI->getFunction()->hasFnAttribute(Attribute::StrictFP)); @@ -1581,7 +1615,7 @@ return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded, AI->getValOperand()); }, - CreateCmpXchg); + CreateCmpXchg, CmpXchgLoopBlocks); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -1729,9 +1763,10 @@ // CAS libcall, via a CAS loop, instead. if (!Success) { expandAtomicRMWToCmpXchg( - I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded, - Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, - SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { + I, + [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded, + Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, + SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { // Create the CAS instruction normally... AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, Alignment, MemOpOrder, @@ -1741,7 +1776,8 @@ // ...and then expand the CAS into a libcall. expandAtomicCASToLibcall(Pair); - }); + }, + CmpXchgLoopBlocks); } } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -14,7 +14,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -50,7 +50,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value: @@ -85,7 +85,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value: @@ -119,7 +119,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value: @@ -155,7 +155,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value: @@ -190,7 +190,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value: @@ -230,7 +230,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value: @@ -268,7 +268,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: @@ -302,7 +302,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: @@ -338,7 +338,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: @@ -373,7 +373,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: @@ -413,7 +413,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: @@ -451,13 +451,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -530,7 +530,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value: @@ -576,7 +576,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value: @@ -624,7 +624,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value: @@ -668,7 +668,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value: @@ -722,7 +722,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value: @@ -772,7 +772,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value: @@ -824,7 +824,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: @@ -875,7 +875,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: @@ -920,7 +920,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: @@ -979,7 +979,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: @@ -1030,7 +1030,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -1038,7 +1038,7 @@ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -1072,7 +1072,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1105,7 +1105,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1137,7 +1137,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1169,7 +1169,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1200,7 +1200,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1236,7 +1236,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1270,7 +1270,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1302,7 +1302,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1334,7 +1334,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1365,7 +1365,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1401,7 +1401,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1435,13 +1435,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1510,7 +1510,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1554,7 +1554,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1598,7 +1598,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1638,7 +1638,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1688,7 +1688,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1734,7 +1734,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1784,7 +1784,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1831,7 +1831,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1872,7 +1872,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1927,7 +1927,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1974,7 +1974,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -1983,7 +1983,7 @@ } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -2017,7 +2017,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2050,7 +2050,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2082,7 +2082,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2114,7 +2114,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2145,7 +2145,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2156,66 +2156,40 @@ ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-NEXT: .LBB4_2: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-NEXT: .LBB4_2: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2247,7 +2221,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2279,7 +2253,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2310,7 +2284,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2321,73 +2295,47 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB4_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 -; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB4_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2456,7 +2404,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2500,7 +2448,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2544,7 +2492,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2584,12 +2532,12 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2604,7 +2552,7 @@ ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1164-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2613,33 +2561,20 @@ ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: s_cbranch_execz .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB5_4: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2651,36 +2586,23 @@ ; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1132-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execz .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start -; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc -; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB5_4: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2730,7 +2652,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2777,7 +2699,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2818,7 +2740,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2847,33 +2769,20 @@ ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: global_atomic_add_f32 v0, v3, s[0:1] +; GFX1164-DPP-NEXT: .LBB5_2: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2895,35 +2804,24 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start -; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc -; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 -; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: global_atomic_add_f32 v0, v3, s[0:1] +; GFX1132-DPP-NEXT: .LBB5_2: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 ret void } + +attributes #0 = {"amdgpu-unsafe-fp-atomics"="true"} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -14,7 +14,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -47,7 +47,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value: @@ -79,7 +79,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value: @@ -110,7 +110,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value: @@ -143,7 +143,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value: @@ -175,7 +175,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value: @@ -211,7 +211,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value: @@ -246,7 +246,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: @@ -277,7 +277,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: @@ -310,7 +310,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: @@ -342,7 +342,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: @@ -378,7 +378,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: @@ -413,13 +413,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 seq_cst ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -498,7 +498,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value: @@ -548,7 +548,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value: @@ -600,7 +600,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value: @@ -648,7 +648,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value: @@ -706,7 +706,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value: @@ -759,7 +759,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value: @@ -831,7 +831,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: @@ -902,7 +902,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: @@ -965,7 +965,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: @@ -1049,7 +1049,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: @@ -1120,7 +1120,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -1128,7 +1128,7 @@ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1159,7 +1159,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1189,7 +1189,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1218,7 +1218,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1247,7 +1247,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1275,7 +1275,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1307,7 +1307,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1338,7 +1338,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1367,7 +1367,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1396,7 +1396,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1424,7 +1424,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1456,7 +1456,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1487,13 +1487,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1568,7 +1568,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1616,7 +1616,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1664,7 +1664,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1708,7 +1708,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1762,7 +1762,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1811,7 +1811,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1881,7 +1881,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1948,7 +1948,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2007,7 +2007,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2087,7 +2087,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2154,7 +2154,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -2163,7 +2163,7 @@ } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2194,7 +2194,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2224,7 +2224,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2253,7 +2253,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2282,7 +2282,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2310,7 +2310,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2342,7 +2342,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2373,7 +2373,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2402,7 +2402,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2431,7 +2431,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2459,7 +2459,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2491,7 +2491,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2522,14 +2522,14 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2604,7 +2604,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2652,7 +2652,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2700,7 +2700,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2744,7 +2744,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2798,7 +2798,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2847,7 +2847,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2917,7 +2917,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2984,7 +2984,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3043,7 +3043,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3123,7 +3123,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3190,10 +3190,12 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 ret void } + +attributes #0 = {"amdgpu-unsafe-fp-atomics"="true"} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -14,7 +14,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -47,7 +47,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value: @@ -79,7 +79,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value: @@ -110,7 +110,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value: @@ -143,7 +143,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value: @@ -175,7 +175,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value: @@ -211,7 +211,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value: @@ -246,7 +246,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: @@ -277,7 +277,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: @@ -310,7 +310,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: @@ -342,7 +342,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: @@ -378,7 +378,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: @@ -413,13 +413,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 seq_cst ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -498,7 +498,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value: @@ -548,7 +548,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value: @@ -600,7 +600,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value: @@ -648,7 +648,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value: @@ -706,7 +706,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value: @@ -759,7 +759,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value: @@ -831,7 +831,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: @@ -902,7 +902,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: @@ -965,7 +965,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: @@ -1049,7 +1049,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: @@ -1120,7 +1120,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -1128,7 +1128,7 @@ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1159,7 +1159,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1189,7 +1189,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1218,7 +1218,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1247,7 +1247,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1275,7 +1275,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1307,7 +1307,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1338,7 +1338,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1367,7 +1367,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1396,7 +1396,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1424,7 +1424,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1456,7 +1456,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1487,13 +1487,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1568,7 +1568,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1616,7 +1616,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1664,7 +1664,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1708,7 +1708,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1762,7 +1762,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1811,7 +1811,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1881,7 +1881,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1948,7 +1948,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2007,7 +2007,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2087,7 +2087,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2154,7 +2154,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -2163,7 +2163,7 @@ } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2194,7 +2194,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2224,7 +2224,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2253,7 +2253,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2282,7 +2282,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2310,7 +2310,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2342,7 +2342,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2373,7 +2373,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2402,7 +2402,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2431,7 +2431,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2459,7 +2459,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2491,7 +2491,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2522,14 +2522,14 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2604,7 +2604,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2652,7 +2652,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2700,7 +2700,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2744,7 +2744,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2798,7 +2798,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2847,7 +2847,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2917,7 +2917,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2984,7 +2984,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3043,7 +3043,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3123,7 +3123,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3190,10 +3190,12 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 ret void } + +attributes #0 = {"amdgpu-unsafe-fp-atomics"="true"} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -14,7 +14,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -50,7 +50,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 -; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value: @@ -85,7 +85,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB0_2 -; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value: @@ -119,7 +119,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value: @@ -155,7 +155,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value: @@ -190,7 +190,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value: @@ -230,7 +230,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value: @@ -268,7 +268,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: @@ -302,7 +302,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: @@ -338,7 +338,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: @@ -373,7 +373,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: @@ -413,7 +413,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: @@ -451,13 +451,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 -; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: .LBB0_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -530,7 +530,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB1_4 -; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value: @@ -576,7 +576,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB1_4 -; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value: @@ -624,7 +624,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value: @@ -668,7 +668,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value: @@ -722,7 +722,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value: @@ -772,7 +772,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 -; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: .LBB1_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value: @@ -835,7 +835,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: @@ -896,7 +896,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: @@ -951,7 +951,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: @@ -1023,7 +1023,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: @@ -1087,7 +1087,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 -; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: .LBB1_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -1095,7 +1095,7 @@ ret void } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -1129,7 +1129,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 -; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1162,7 +1162,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB2_2 -; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1194,7 +1194,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1226,7 +1226,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1257,7 +1257,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1293,7 +1293,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1327,7 +1327,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1359,7 +1359,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1391,7 +1391,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1422,7 +1422,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1458,7 +1458,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: @@ -1492,13 +1492,13 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 -; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: .LBB2_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1567,7 +1567,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB3_4 -; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1611,7 +1611,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1655,7 +1655,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1695,7 +1695,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1745,7 +1745,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1791,7 +1791,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 -; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: .LBB3_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1852,7 +1852,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1909,7 +1909,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -1960,7 +1960,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2028,7 +2028,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: @@ -2088,7 +2088,7 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 -; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: .LBB3_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float @@ -2097,7 +2097,7 @@ } -define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec @@ -2131,7 +2131,7 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 -; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2164,7 +2164,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB4_2 -; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2196,7 +2196,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2228,7 +2228,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2259,7 +2259,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2295,7 +2295,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2329,7 +2329,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2361,7 +2361,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2393,7 +2393,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2424,7 +2424,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2460,7 +2460,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: @@ -2494,14 +2494,14 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 -; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: .LBB4_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 ret void } -define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2570,7 +2570,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2614,7 +2614,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB5_4 -; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2658,7 +2658,7 @@ ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2698,7 +2698,7 @@ ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2748,7 +2748,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2794,7 +2794,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 -; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: .LBB5_5: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2855,7 +2855,7 @@ ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2912,7 +2912,7 @@ ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -2963,7 +2963,7 @@ ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3031,7 +3031,7 @@ ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: @@ -3091,10 +3091,12 @@ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 -; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: .LBB5_3: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %id.x = call i32 @llvm.amdgcn.workitem.id.x() %divValue = bitcast i32 %id.x to float %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 ret void } + +attributes #0 = {"amdgpu-unsafe-fp-atomics"="true"}