diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -202,9 +202,16 @@ case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: break; } + // Only 32-bit floating point atomic ops are supported. + if (AtomicRMWInst::isFPOperation(Op) && !I.getType()->isFloatTy()) { + return; + } + const unsigned PtrIdx = 0; const unsigned ValIdx = 1; @@ -305,7 +312,6 @@ } const unsigned ValIdx = 0; - const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different @@ -344,8 +350,12 @@ llvm_unreachable("Unhandled atomic op"); case AtomicRMWInst::Add: return B.CreateBinOp(Instruction::Add, LHS, RHS); + case AtomicRMWInst::FAdd: + return B.CreateBinOp(Instruction::FAdd, LHS, RHS); case AtomicRMWInst::Sub: return B.CreateBinOp(Instruction::Sub, LHS, RHS); + case AtomicRMWInst::FSub: + return B.CreateBinOp(Instruction::FSub, LHS, RHS); case AtomicRMWInst::And: return B.CreateBinOp(Instruction::And, LHS, RHS); case AtomicRMWInst::Or: @@ -376,75 +386,91 @@ AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const { - Type *const Ty = V->getType(); + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, IntNTy); // Reduce within each row of 16 lanes. 
for (unsigned Idx = 0; Idx < 4; Idx++) { - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + V = B.CreateBitCast(V, IntNTy); + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); } // Reduce within each pair of rows (i.e. 32 lanes). assert(ST->hasPermLaneX16()); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); - - if (ST->isWave32()) + V = B.CreateBitCast(V, IntNTy); + Value *Permlanex16Call = B.CreateIntrinsic( + Intrinsic::amdgcn_permlanex16, {}, + {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + Permlanex16Call = B.CreateBitCast(Permlanex16Call, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), Permlanex16Call); + if (ST->isWave32()) { return V; + } if (ST->hasPermLane64()) { // Reduce across the upper and lower 32 lanes. - return buildNonAtomicBinOp( - B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V)); + V = B.CreateBitCast(V, IntNTy); + Value *Permlane64Call = + B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V); + Permlane64Call = B.CreateBitCast(Permlane64Call, AtomicTy); + return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), + Permlane64Call); } // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
Function *ReadLane = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); - Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); - return buildNonAtomicBinOp(B, Op, Lane0, Lane32); + V = B.CreateBitCast(V, IntNTy); + Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); + Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); + return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy), + B.CreateBitCast(Lane32, AtomicTy)); } // Use the builder to create an inclusive scan of V across the wavefront, with // all lanes active. Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, - Value *const Identity) const { - Type *const Ty = V->getType(); + Value *Identity) const { + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); + Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, IntNTy); for (unsigned Idx = 0; Idx < 4; Idx++) { - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + V = B.CreateBitCast(V, IntNTy); + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); } + if (ST->hasDPPBroadcasts()) { // GFX9 has DPP row broadcast operations. 
- V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), - B.getInt32(0xf), B.getFalse()})); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), - B.getInt32(0xf), B.getFalse()})); + V = B.CreateBitCast(V, IntNTy); + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()}); + + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); + V = B.CreateBitCast(V, IntNTy); + UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), + B.getInt32(0xf), B.getFalse()}); + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); } else { // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. @@ -452,23 +478,31 @@ // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). 
assert(ST->hasPermLaneX16()); - Value *const PermX = B.CreateIntrinsic( + V = B.CreateBitCast(V, IntNTy); + Value *PermX = B.CreateIntrinsic( Intrinsic::amdgcn_permlanex16, {}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), - B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + + PermX = B.CreateBitCast(PermX, IntNTy); + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}); + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); + if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. + V = B.CreateBitCast(V, IntNTy); Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, B.getInt32(31)}); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), - B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}); + + UpdateDPPCall = B.CreateBitCast(UpdateDPPCall, AtomicTy); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), + UpdateDPPCall); } } return V; @@ -477,17 +511,21 @@ // Use the builder to create a shift right of V across the wavefront, with all // lanes active, to turn an inclusive scan into an exclusive scan. 
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, - Value *const Identity) const { - Type *const Ty = V->getType(); + Value *Identity) const { + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); + Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); - + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, IntNTy); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. + V = B.CreateBitCast(V, IntNTy); V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + V = B.CreateBitCast(V, AtomicTy); + } else { Function *ReadLane = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); @@ -497,16 +535,21 @@ // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. Value *Old = V; + V = B.CreateBitCast(V, IntNTy); V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - + V = B.CreateBitCast(V, IntNTy); + Old = B.CreateBitCast(Old, IntNTy); // Copy the old lane 15 to the new lane 16. V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), B.getInt32(16), V}); - + V = B.CreateBitCast(V, AtomicTy); + Old = B.CreateBitCast(Old, AtomicTy); if (!ST->isWave32()) { // Copy the old lane 31 to the new lane 32. 
+ V = B.CreateBitCast(V, IntNTy); + Old = B.CreateBitCast(Old, IntNTy); V = B.CreateCall( WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); @@ -515,6 +558,7 @@ V = B.CreateCall( WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + V = B.CreateBitCast(V, AtomicTy); } } @@ -529,7 +573,6 @@ std::pair AMDGPUAtomicOptimizerImpl::buildScanIteratively( IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const { - auto *Ty = I.getType(); auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); auto *EntryBB = I.getParent(); @@ -554,18 +597,25 @@ // Use llvm.cttz instrinsic to find the lowest remaining active lane. auto *FF1 = B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); - auto *LaneIdxInt = B.CreateTrunc(FF1, Ty); + + Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); + auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy); // Get the value required for atomic operation - auto *LaneValue = + V = B.CreateBitCast(V, IntNTy); + Value *LaneValue = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt}); + LaneValue = B.CreateBitCast(LaneValue, Ty); // Perform writelane if intermediate scan results are required later in the // kernel computations Value *OldValue = nullptr; if (NeedResult) { - OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, - {Accumulator, LaneIdxInt, OldValuePhi}); + OldValue = + B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, + {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt, + B.CreateBitCast(OldValuePhi, IntNTy)}); + OldValue = B.CreateBitCast(OldValue, Ty); OldValuePhi->addIncoming(OldValue, ComputeLoop); } @@ -590,8 +640,10 @@ return {OldValue, NewAccumulator}; } -static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, - unsigned BitWidth) { +static Constant *getIdentityValueForAtomicOp(Type *const Ty, + AtomicRMWInst::BinOp Op) { + LLVMContext &C = 
Ty->getContext(); + const unsigned BitWidth = Ty->getPrimitiveSizeInBits(); switch (Op) { default: llvm_unreachable("Unhandled atomic op"); @@ -600,14 +652,18 @@ case AtomicRMWInst::Or: case AtomicRMWInst::Xor: case AtomicRMWInst::UMax: - return APInt::getMinValue(BitWidth); + return ConstantInt::get(C, APInt::getMinValue(BitWidth)); case AtomicRMWInst::And: case AtomicRMWInst::UMin: - return APInt::getMaxValue(BitWidth); + return ConstantInt::get(C, APInt::getMaxValue(BitWidth)); case AtomicRMWInst::Max: - return APInt::getSignedMinValue(BitWidth); + return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth)); case AtomicRMWInst::Min: - return APInt::getSignedMaxValue(BitWidth); + return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth)); + case AtomicRMWInst::FAdd: + return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true)); + case AtomicRMWInst::FSub: + return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false)); } } @@ -649,12 +705,15 @@ } Type *const Ty = I.getType(); + Type *Int32Ty = B.getInt32Ty(); + Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); + bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); - auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2); + auto *const VecTy = FixedVectorType::get(Int32Ty, 2); // This is the value in the atomic operation we need to combine in order to // reduce the number of atomic operations. - Value *const V = I.getOperand(ValIdx); + Value *V = I.getOperand(ValIdx); // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. 
@@ -679,17 +738,16 @@ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); } - Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); - Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); + Function *F = I.getFunction(); + LLVMContext &C = F->getContext(); + Value *Identity = getIdentityValueForAtomicOp(Ty, Op); Value *ExclScan = nullptr; Value *NewV = nullptr; const bool NeedResult = !I.use_empty(); - Function *F = I.getFunction(); - LLVMContext &C = F->getContext(); BasicBlock *ComputeLoop = nullptr; BasicBlock *ComputeEnd = nullptr; // If we have a divergent value in each lane, we need to combine the value @@ -700,8 +758,12 @@ if (ScanImpl == ScanOptions::DPP) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - NewV = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + V = B.CreateBitCast(V, IntNTy); + Identity = B.CreateBitCast(Identity, IntNTy); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy, + {V, Identity}); + NewV = B.CreateBitCast(NewV, Ty); + V = B.CreateBitCast(V, Ty); const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; if (!NeedResult && ST->hasPermLaneX16()) { @@ -718,8 +780,10 @@ // which we will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); assert(TyBitWidth == 32); + NewV = B.CreateBitCast(NewV, IntNTy); NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx}); + NewV = B.CreateBitCast(NewV, Ty); } // Finally mark the readlanes in the WWM section. 
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); @@ -746,7 +810,14 @@ NewV = buildMul(B, V, Ctpop); break; } - + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: { + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false); + Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty); + NewV = B.CreateFMul(V, CtpopFP); + break; + } case AtomicRMWInst::And: case AtomicRMWInst::Or: case AtomicRMWInst::Max: @@ -771,7 +842,7 @@ // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. - Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); + Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0)); // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -840,9 +911,8 @@ Value *BroadcastI = nullptr; if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); + Value *const ExtractLo = B.CreateTrunc(PHI, Int32Ty); + Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(PHI, 32), Int32Ty); CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); CallInst *const ReadFirstLaneHi = @@ -853,8 +923,11 @@ B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); BroadcastI = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { + Value *CastedPhi = B.CreateBitCast(PHI, IntNTy); + BroadcastI = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi); + BroadcastI = B.CreateBitCast(BroadcastI, Ty); - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -874,6 +947,8 @@ llvm_unreachable("Atomic Optimzer is disabled for None 
strategy"); } } else { + Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty) + : B.CreateIntCast(Mbcnt, Ty, false); switch (Op) { default: llvm_unreachable("Unhandled atomic op"); @@ -892,6 +967,11 @@ case AtomicRMWInst::Xor: LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1)); break; + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: { + LaneOffset = B.CreateFMul(V, Mbcnt); + break; + } } } Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP 
-stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic @@ -15,6 +15,7 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -41,6 +42,7 @@ ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -67,6 +69,7 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -93,6 +96,7 @@ ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], 
[[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -119,6 +123,7 @@ ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -134,27 +139,81 @@ } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX90A_GFX940: bb.1 (%ir-block.0): + ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; 
GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5): + ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY6]], [[COPY7]], implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY8]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY9]], implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], 
[[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: 
[[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY16]], implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY17]], implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.54): + ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.4.Flow: + ; GFX90A_GFX940-NEXT: successors: %bb.6(0x80000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.6 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.56): + ; GFX90A_GFX940-NEXT: 
successors: %bb.4(0x80000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.6 (%ir-block.57): ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic @@ -15,6 +15,7 @@ ; 
GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -43,6 +44,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -71,6 +73,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -99,6 +102,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic ; GFX11: 
bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -127,6 +131,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -143,29 +148,90 @@ } define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX11-NEXT: 
[[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE + ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.2 (%ir-block.5): + ; GFX11-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit 
$exec + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 15 + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_3]] + ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 [[V_READLANE_B32_]], [[S_MOV_B32_4]], [[V_MOV_B32_dpp5]] + ; GFX11-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 31 + ; GFX11-NEXT: 
[[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], [[S_MOV_B32_5]] + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_1]] + ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY13]], implicit $exec + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY14]], implicit $exec + ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.3 (%ir-block.52): + ; GFX11-NEXT: successors: %bb.5(0x80000000) + ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: S_BRANCH %bb.5 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.4.Flow: + ; GFX11-NEXT: successors: %bb.6(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %40, %bb.5, [[DEF]], %bb.1 + ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.6 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.5 (%ir-block.55): + ; GFX11-NEXT: successors: %bb.4(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 + ; 
GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.6 (%ir-block.63): + ; GFX11-NEXT: $vgpr0 = COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -994,19 +994,19 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s5, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1016,8 +1016,8 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc @@ -1028,19 +1028,19 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1050,7 +1050,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,18 +1062,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v3, s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1083,7 +1083,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1094,20 +1094,20 @@ ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX1064-NEXT: 
ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: @@ -1116,7 +1116,7 @@ ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1126,19 +1126,19 @@ ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: @@ -1147,7 +1147,7 @@ ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v3, 5, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1157,22 +1157,22 @@ ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 
s[4:5], exec -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: @@ -1181,7 +1181,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v3, 5, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1193,21 +1193,21 @@ ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; 
GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] +; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: @@ -1216,7 +1216,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v3, 5, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1239,13 +1239,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -1282,7 +1282,6 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: 
s_and_saveexec_b64 s[4:5], vcc @@ -1293,8 +1292,9 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1321,7 +1321,6 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1335,6 +1334,7 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1358,7 +1358,6 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1367,6 +1366,7 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1396,14 +1396,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 
s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -1433,9 +1433,8 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1443,6 +1442,7 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1476,15 +1476,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -2588,19 +2588,19 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s5, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v3 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2610,8 +2610,8 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -2622,19 +2622,19 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] 
+; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2642,8 +2642,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v3 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -2656,18 +2656,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2675,8 +2675,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v3 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -2688,20 +2688,20 @@ ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], 
exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: @@ -2709,9 +2709,9 @@ ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -2723,19 +2723,19 @@ ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: @@ -2743,9 +2743,9 @@ ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -2757,31 +2757,31 @@ ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc @@ -2796,30 +2796,30 @@ ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_mov_b32 s3, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo @@ -2845,13 +2845,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -2888,7 +2888,6 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2899,8 +2898,9 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2928,7 +2928,6 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, 
v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2942,6 +2941,7 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2967,7 +2967,6 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2976,6 +2975,7 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3008,14 +3008,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -3048,9 +3048,8 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 
v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3058,6 +3057,7 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3093,15 +3093,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll @@ -21,7 +21,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_agent: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_agent(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -30,7 +30,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wg: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wg(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ 
-39,7 +39,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wavefront: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -48,7 +48,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_single_thread: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -57,7 +57,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_aoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -66,7 +66,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wgoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -75,7 +75,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wfoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -84,7 +84,7 @@ } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_stoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic @@ -16,6 +16,7 @@ ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -43,6 +44,7 
@@ ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -70,6 +72,7 @@ ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -97,6 +100,7 @@ ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -124,6 +128,7 @@ ; GFX908_GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -140,27 +145,72 @@ } define amdgpu_ps void 
@global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX908_GFX11: bb.0 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE + ; GFX90A_GFX940-NEXT: 
[[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.1 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5): + ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1 + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY5]], [[COPY7]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY6]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], 
[[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] + ; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.54): + ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) + ; 
GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.3.Flow: + ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.56): + ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.57): ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic @@ -16,6 +16,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -45,6 +46,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -74,6 +76,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: 
SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -103,6 +106,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -132,6 +136,7 @@ ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -149,29 +154,81 @@ } define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: 
[[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): + ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE + ; GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.1 (%ir-block.5): + ; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 
[[COPY4]], [[S_MOV_B32_]], implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, killed [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept 
V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 273, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 15 + ; GFX11-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_2]] + ; GFX11-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX11-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed [[V_READLANE_B32_]], killed [[S_MOV_B32_3]], [[V_MOV_B32_dpp5]] + ; GFX11-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 31 + ; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]] + ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.2 (%ir-block.52): + ; GFX11-NEXT: successors: %bb.4(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY5]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: S_BRANCH %bb.4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.3.Flow: + ; GFX11-NEXT: successors: %bb.5(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI 
[[DEF]], %bb.0, %7, %bb.4 + ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.5 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.4 (%ir-block.55): + ; GFX11-NEXT: successors: %bb.3(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec + ; GFX11-NEXT: early-clobber %43:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %43, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.5 (%ir-block.63): + ; GFX11-NEXT: $vgpr0 = COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -5,19 +5,29 @@ ; instructions not actually supported by the subtarget. ; FIXME: This will still fail for gfx6/7 and gfx10 subtargets. 
-; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9]+}}: DD348000 -; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9]+}}: 00000100 +; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000 +; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 4.0 +; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -203,13 +203,29 @@ ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB1_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 
v1, 4.0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB1_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX90A-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; @@ -243,14 +259,29 @@ ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: .LBB1_2: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -287,24 +318,44 @@ ; 
; GFX908-LABEL: global_atomic_fadd_noret_f32: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_cbranch_execz .LBB2_2 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: .LBB2_2: ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_noret_f32: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB2_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB2_2: ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_noret_f32: @@ -335,14 +386,23 @@ ; ; GFX11-LABEL: global_atomic_fadd_noret_f32: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: .LBB2_2: ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void @@ -375,24 +435,44 @@ ; ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_cbranch_execz .LBB3_2 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: .LBB3_2: ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB3_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: 
v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB3_2: ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: @@ -423,14 +503,23 @@ ; ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: .LBB3_2: ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void @@ -491,13 +580,28 @@ ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB4_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; 
GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB4_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; @@ -531,14 +635,29 @@ ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: .LBB4_2: +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -751,24 +870,44 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) 
#1 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 4.0 +; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: .LBB7_2: ; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX11-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void @@ -929,24 +1068,42 @@ ; ; GFX908-LABEL: infer_as_before_atomic: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX908-NEXT: s_cbranch_execz .LBB9_2 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX908-NEXT: .LBB9_2: ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: infer_as_before_atomic: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB9_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX90A-NEXT: .LBB9_2: ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: infer_as_before_atomic: @@ -975,12 +1132,21 @@ ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_add_f32 v0, 
v1, s[0:1] +; GFX11-NEXT: .LBB9_2: ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -4,12 +4,68 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], 
float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-ITERATIVE-NEXT: br label [[TMP24]] +; IR-ITERATIVE: 24: +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP25]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 
[[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-DPP-NEXT: br label [[TMP24]] +; IR-DPP: 24: +; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ -17,12 +73,120 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] 
= call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast 
float [[OLDVALUEPHI]] to i32 +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float +; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 +; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP60:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; 
IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 
@llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP43]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP46]], i32 63) +; IR-DPP-NEXT: [[TMP48:%.*]] = bitcast i32 [[TMP47]] to float +; IR-DPP-NEXT: [[TMP49:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP48]]) +; IR-DPP-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP50]], label [[TMP51:%.*]], label [[TMP53:%.*]] +; IR-DPP: 51: +; IR-DPP-NEXT: [[TMP52:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP49]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP53]] +; IR-DPP: 53: +; IR-DPP-NEXT: [[TMP54:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP52]], [[TMP51]] ] +; IR-DPP-NEXT: [[TMP55:%.*]] = bitcast float [[TMP54]] to i32 +; IR-DPP-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP55]]) +; IR-DPP-NEXT: [[TMP57:%.*]] = bitcast i32 [[TMP56]] to float +; IR-DPP-NEXT: [[TMP58:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP59:%.*]] = fadd float [[TMP57]], [[TMP58]] +; IR-DPP-NEXT: br label [[TMP60]] +; IR-DPP: 60: +; IR-DPP-NEXT: [[TMP61:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP59]], [[TMP53]] ] +; IR-DPP-NEXT: ret float [[TMP61]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ -30,12 +194,68 @@ define amdgpu_ps float 
@global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast 
i32 [[TMP19]] to float +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-ITERATIVE-NEXT: br label [[TMP24]] +; IR-ITERATIVE: 24: +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP25]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = 
bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-DPP-NEXT: br label [[TMP24]] +; IR-DPP: 24: +; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result @@ -43,12 +263,120 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float 
[[TMP29:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float +; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 
[[TMP30]], -1 +; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP60:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 
@llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP43]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP46]], i32 63) +; IR-DPP-NEXT: [[TMP48:%.*]] = bitcast i32 [[TMP47]] to float +; IR-DPP-NEXT: [[TMP49:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP48]]) +; IR-DPP-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP50]], label [[TMP51:%.*]], label [[TMP53:%.*]] +; IR-DPP: 51: +; IR-DPP-NEXT: [[TMP52:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP49]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP53]] +; IR-DPP: 53: +; IR-DPP-NEXT: [[TMP54:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP52]], [[TMP51]] ] +; IR-DPP-NEXT: [[TMP55:%.*]] = bitcast float [[TMP54]] to i32 +; IR-DPP-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP55]]) +; IR-DPP-NEXT: [[TMP57:%.*]] = bitcast i32 [[TMP56]] to float +; IR-DPP-NEXT: [[TMP58:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP59:%.*]] = fadd float [[TMP57]], [[TMP58]] +; IR-DPP-NEXT: br label [[TMP60]] +; IR-DPP: 60: +; IR-DPP-NEXT: [[TMP61:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP59]], [[TMP53]] ] +; IR-DPP-NEXT: ret float [[TMP61]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result @@ -56,12 +384,68 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 
@llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-ITERATIVE-NEXT: br label [[TMP24]] +; IR-ITERATIVE: 24: +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP25]] ; ; 
IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-DPP-NEXT: br label [[TMP24]] +; IR-DPP: 24: +; IR-DPP-NEXT: [[TMP25:%.*]] 
= phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -70,12 +454,120 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; 
IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fsub float [[TMP16]], [[TMP28:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float +; IR-ITERATIVE-NEXT: [[TMP29]] = fsub float [[ACCUMULATOR]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 +; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: 
@global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP60:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 0) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fsub float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fsub float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 
@llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fsub float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fsub float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fsub float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fsub float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP43]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP46]], i32 63) +; IR-DPP-NEXT: [[TMP48:%.*]] = bitcast i32 [[TMP47]] to float +; IR-DPP-NEXT: [[TMP49:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP48]]) +; IR-DPP-NEXT: [[TMP50:%.*]] = 
icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP50]], label [[TMP51:%.*]], label [[TMP53:%.*]] +; IR-DPP: 51: +; IR-DPP-NEXT: [[TMP52:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP49]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP53]] +; IR-DPP: 53: +; IR-DPP-NEXT: [[TMP54:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP52]], [[TMP51]] ] +; IR-DPP-NEXT: [[TMP55:%.*]] = bitcast float [[TMP54]] to i32 +; IR-DPP-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP55]]) +; IR-DPP-NEXT: [[TMP57:%.*]] = bitcast i32 [[TMP56]] to float +; IR-DPP-NEXT: [[TMP58:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP59:%.*]] = fsub float [[TMP57]], [[TMP58]] +; IR-DPP-NEXT: br label [[TMP60]] +; IR-DPP: 60: +; IR-DPP-NEXT: [[TMP61:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP59]], [[TMP53]] ] +; IR-DPP-NEXT: ret float [[TMP61]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -135,12 +627,68 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], 
i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-ITERATIVE-NEXT: br label [[TMP24]] +; IR-ITERATIVE: 24: +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP25]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: 
[[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-DPP-NEXT: br label [[TMP24]] +; IR-DPP: 24: +; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-DPP-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result @@ -148,12 +696,120 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; 
IR-ITERATIVE-NEXT: ret float [[RESULT]] +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP18:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP29:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[COMPUTEEND:%.*]] ], [ [[TMP11]], [[TMP10:%.*]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP28:%.*]] +; IR-ITERATIVE-NEXT: br label [[TMP18]] +; IR-ITERATIVE: 18: +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP17]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret float [[TMP19]] +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP29]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[OLDVALUEPHI:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP28]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP32:%.*]], 
[[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP22]], i32 [[TMP21]]) +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = bitcast float [[ACCUMULATOR]] to i32 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32 +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) +; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float +; IR-ITERATIVE-NEXT: [[TMP29]] = fadd float [[ACCUMULATOR]], [[TMP24]] +; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]] +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1 +; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]] +; IR-ITERATIVE-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP33]], label [[COMPUTEEND]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP34]], label [[TMP10]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP60:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: 
[[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], 
[[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP43]], i32 312, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP46]], i32 63) +; IR-DPP-NEXT: [[TMP48:%.*]] = bitcast i32 [[TMP47]] to float +; IR-DPP-NEXT: [[TMP49:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP48]]) +; IR-DPP-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP50]], label [[TMP51:%.*]], label [[TMP53:%.*]] +; IR-DPP: 51: +; IR-DPP-NEXT: [[TMP52:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP49]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP53]] +; IR-DPP: 53: +; IR-DPP-NEXT: [[TMP54:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP52]], [[TMP51]] ] +; IR-DPP-NEXT: [[TMP55:%.*]] = bitcast float [[TMP54]] to i32 +; IR-DPP-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP55]]) +; IR-DPP-NEXT: [[TMP57:%.*]] = bitcast i32 [[TMP56]] to float +; IR-DPP-NEXT: [[TMP58:%.*]] = call float 
@llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP59:%.*]] = fadd float [[TMP57]], [[TMP58]] +; IR-DPP-NEXT: br label [[TMP60]] +; IR-DPP: 60: +; IR-DPP-NEXT: [[TMP61:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP59]], [[TMP53]] ] +; IR-DPP-NEXT: ret float [[TMP61]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -0,0 +1,285 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; 
IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; 
IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_value( +; IR-DPP-NEXT: 
[[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = 
fadd float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fadd float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fadd float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = fadd float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst + 
ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +;
IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + + +define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT:
[[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fsub float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fsub_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = 
bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fsub float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fsub float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = fsub float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fsub float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fsub float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; 
IR-DPP-NEXT: [[TMP40:%.*]] = fsub float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +attributes #0 = {"target-cpu"="gfx906"} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -4,11 +4,51 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] 
= extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] 
= fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 @@ -17,11 +57,98 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: ; 
IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 +; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP51:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; 
IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; 
IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP43]], i32 63) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP47]], label [[TMP48:%.*]], label [[TMP50:%.*]] +; IR-DPP: 48: +; IR-DPP-NEXT: [[TMP49:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP46]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP50]] +; IR-DPP: 50: +; IR-DPP-NEXT: br label [[TMP51]] +; IR-DPP: 51: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 @@ -30,11 +157,51 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; 
IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: 
[[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic @@ -43,11 +210,98 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call 
i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 +; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP51:%.*]] +; 
IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; 
IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP43]], i32 63) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP47]], label [[TMP48:%.*]], label [[TMP50:%.*]] +; IR-DPP: 48: +; IR-DPP-NEXT: [[TMP49:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP46]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP50]] +; IR-DPP: 50: +; IR-DPP-NEXT: br label [[TMP51]] +; IR-DPP: 51: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic @@ -56,11 +310,51 @@ define amdgpu_ps void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; ; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 
@llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic @@ -70,11 +364,98 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x 
i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-ITERATIVE-NEXT: [[TMP19]] = fsub float [[ACCUMULATOR]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 +; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label 
[[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP51:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 0) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fsub float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; IR-DPP-NEXT: [[TMP22:%.*]] = fsub float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 
@llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fsub float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fsub float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fsub float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fsub float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP43]], i32 63) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP47]], label [[TMP48:%.*]], label [[TMP50:%.*]] +; IR-DPP: 48: +; IR-DPP-NEXT: [[TMP49:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP46]] syncscope("agent") monotonic, align 4 +; 
IR-DPP-NEXT: br label [[TMP50]] +; IR-DPP: 50: +; IR-DPP-NEXT: br label [[TMP51]] +; IR-DPP: 51: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic @@ -135,11 +516,51 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; ; IR-DPP-LABEL: 
@global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 @@ -148,11 +569,98 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label 
[[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP19:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP12:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP2]] ], [ [[TMP19]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP9]], [[TMP2]] ], [ [[TMP22:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-ITERATIVE-NEXT: [[TMP19]] = fadd float [[ACCUMULATOR]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]] +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1 +; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = icmp eq i64 [[TMP22]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP23]], label [[COMPUTEEND:%.*]], label 
[[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP24]], label [[TMP10:%.*]], label [[TMP12]] ; ; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP51:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <2 x i32> +; IR-DPP-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; IR-DPP-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP5]], i32 0) +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast float [[VAL:%.*]] to i32 +; IR-DPP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP9]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float +; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast float [[TMP11]] to i32 +; IR-DPP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP13]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP13]] to float +; IR-DPP-NEXT: [[TMP17:%.*]] = fadd float [[TMP16]], [[TMP15]] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP18]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP18]] to float +; 
IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP20]] +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32 +; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP23]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP23]] to float +; IR-DPP-NEXT: [[TMP27:%.*]] = fadd float [[TMP26]], [[TMP25]] +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast float [[TMP27]] to i32 +; IR-DPP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP28]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP30:%.*]] = bitcast i32 [[TMP29]] to float +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP28]] to float +; IR-DPP-NEXT: [[TMP32:%.*]] = fadd float [[TMP31]], [[TMP30]] +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast float [[TMP32]] to i32 +; IR-DPP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP33]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast i32 [[TMP33]] to float +; IR-DPP-NEXT: [[TMP37:%.*]] = fadd float [[TMP36]], [[TMP35]] +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast float [[TMP37]] to i32 +; IR-DPP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP38]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP38]] to float +; IR-DPP-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[TMP40]] +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast float [[TMP42]] to i32 +; IR-DPP-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP43]], i32 63) +; IR-DPP-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float +; IR-DPP-NEXT: [[TMP46:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP45]]) +; IR-DPP-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 
[[TMP47]], label [[TMP48:%.*]], label [[TMP50:%.*]] +; IR-DPP: 48: +; IR-DPP-NEXT: [[TMP49:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP46]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP50]] +; IR-DPP: 50: +; IR-DPP-NEXT: br label [[TMP51]] +; IR-DPP: 51: ; IR-DPP-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -109,21 +109,44 @@ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -196,21 +219,44 @@ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB0_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB0_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm @@ -410,41 +456,70 @@ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-NEXT: s_mov_b32 s12, s8 -; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_mov_b32 s13, s9 -; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-NEXT: s_mov_b32 s14, s10 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-NEXT: s_getpc_b64 s[6:7] -; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: 
s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB1_4: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s4, 44 -; GFX1132-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-NEXT: s_getpc_b64 s[6:7] -; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -452,7 +527,33 @@ ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 
v1, s2, v1 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB1_4: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -597,41 +698,88 @@ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; 
GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB1_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; 
GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -639,7 +787,47 @@ ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB1_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm @@ -1836,41 +2024,70 @@ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-NEXT: s_mov_b32 s12, s8 -; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_mov_b32 s13, s9 -; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-NEXT: s_mov_b32 s14, s10 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-NEXT: s_getpc_b64 s[6:7] -; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB5_4: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, 
s4, 44 -; GFX1132-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-NEXT: s_getpc_b64 s[6:7] -; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -1878,7 +2095,33 @@ ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: 
s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB5_4: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2023,41 +2266,88 @@ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB5_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -2065,7 +2355,47 @@ ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 
v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB5_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm @@ -2267,41 +2597,70 @@ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-NEXT: s_mov_b32 s12, s8 -; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_mov_b32 s13, s9 -; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-NEXT: s_mov_b32 s14, s10 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-NEXT: s_getpc_b64 s[6:7] -; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: .LBB6_4: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s4, 44 -; GFX1132-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-NEXT: s_getpc_b64 s[6:7] -; GFX1132-NEXT: s_add_u32 s6, 
s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -2309,7 +2668,33 @@ ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX1132-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: .LBB6_4: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2454,41 +2839,88 @@ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1164-DPP-NEXT: .LBB6_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] -; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -2496,7 +2928,47 @@ ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1] +; GFX1132-DPP-NEXT: .LBB6_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN -; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -143,24 +143,88 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; VI-LABEL: lds_ds_fadd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; 
VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: +; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_lshl_b32 s8, s3, 3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s3, 3 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: .LBB2_2: +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: s_mov_b64 s[6:7], exec ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB2_4 +; VI-NEXT: ; %bb.3: +; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: ds_add_f32 v2, v0 offset:64 +; VI-NEXT: ds_add_f32 v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: .LBB2_4: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_add_f32_e32 v2, s8, v0 +; VI-NEXT: v_bfrev_b32_e32 v1, 1 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: .LBB2_5: ; %ComputeLoop +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_ff1_i32_b32 s3, s5 +; VI-NEXT: s_ff1_i32_b32 s6, s4 +; VI-NEXT: s_add_i32 s3, 
s3, 32 +; VI-NEXT: s_min_u32 s3, s6, s3 +; VI-NEXT: s_lshl_b64 s[6:7], 1, s3 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readlane_b32 s9, v2, s3 +; VI-NEXT: s_mov_b32 m0, s3 +; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; VI-NEXT: v_writelane_b32 v0, s8, m0 +; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-NEXT: v_add_f32_e32 v1, s9, v1 +; VI-NEXT: s_cbranch_scc1 .LBB2_5 +; VI-NEXT: ; %bb.6: ; %ComputeEnd +; VI-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_8 +; VI-NEXT: ; %bb.7: +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: ds_add_rtn_f32 v2, v0, v1 +; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: .LBB2_8: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s2, v2 +; VI-NEXT: v_add_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -168,24 +232,87 @@ ; ; GFX9-LABEL: lds_ds_fadd: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s3, s3, 4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_lshl_b32 s8, s3, 3 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 
+; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s4, s3, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 4 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: ds_add_f32 v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: .LBB2_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: .LBB2_5: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s5 +; GFX9-NEXT: s_ff1_i32_b32 s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s6, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readlane_b32 s9, v2, s3 +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_writelane_b32 v0, s8, m0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.6: ; %ComputeEnd +; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -344,41 +471,164 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; VI-LABEL: lds_ds_fadd_one_as: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: ; %bb.1: +; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_lshl_b32 s8, s3, 3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 +; VI-NEXT: .LBB3_2: +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: s_mov_b64 s[6:7], exec ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s3, 3 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 
+; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cbranch_execz .LBB3_4 +; VI-NEXT: ; %bb.3: +; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: ds_add_f32 v2, v0 offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_add_rtn_f32 v2, v0, v1 +; VI-NEXT: ds_add_f32 v2, v1 +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: v_add_f32_e32 v2, s8, v0 +; VI-NEXT: v_bfrev_b32_e32 v1, 1 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: .LBB3_5: ; %ComputeLoop +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_ff1_i32_b32 s3, s5 +; VI-NEXT: s_ff1_i32_b32 s6, s4 +; VI-NEXT: s_add_i32 s3, s3, 32 +; VI-NEXT: s_min_u32 s3, s6, s3 +; VI-NEXT: s_lshl_b64 s[6:7], 1, s3 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readlane_b32 s9, v2, s3 +; VI-NEXT: s_mov_b32 m0, s3 +; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; VI-NEXT: v_writelane_b32 v0, s8, m0 +; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-NEXT: v_add_f32_e32 v1, s9, v1 +; VI-NEXT: s_cbranch_scc1 .LBB3_5 +; VI-NEXT: ; %bb.6: ; %ComputeEnd +; VI-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_8 +; VI-NEXT: ; %bb.7: +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 +; VI-NEXT: .LBB3_8: +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s2, v2 +; VI-NEXT: v_add_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: lds_ds_fadd_one_as: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s3, s3, 4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_lshl_b32 s8, s3, 3 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s4, s3, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 4 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_add_f32 v2, v1 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: .LBB3_5: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s5 +; GFX9-NEXT: s_ff1_i32_b32 s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s6, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readlane_b32 s9, v2, s3 +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: v_writelane_b32 v0, s8, m0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX9-NEXT: ; %bb.6: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX9-NEXT: .LBB3_8: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ;