diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -85,7 +85,9 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
-FunctionPass *createAMDGPUAtomicOptimizerPass();
+/// Strategy the atomic optimizer uses to compute scan and reduced values.
+enum class ScanOptions : bool { DPP, Iterative };
+FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
 void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
 extern char &AMDGPUAtomicOptimizerID;
 
@@ -234,11 +236,13 @@
 };
 
 struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
-  AMDGPUAtomicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+  AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
+      : TM(TM), ScanImpl(ScanImpl) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
 private:
   TargetMachine &TM;
+  ScanOptions ScanImpl;
 };
 
 Pass *createAMDGPUStructurizeCFGPass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -10,7 +10,15 @@
 /// This pass optimizes atomic operations by using a single lane of a wavefront
 /// to perform the atomic operation, thus reducing contention on that memory
 /// location.
-//
+/// The atomic optimizer uses one of the following strategies to compute the
+/// scan and reduced values:
+/// 1. DPP -
+///   This is the most efficient implementation for scan. DPP uses Whole Wave
+///   Mode (WWM).
+/// 2. Iterative -
+///   An alternative implementation that iterates over all active lanes of the
+///   wavefront using llvm.cttz and performs the scan using readlane &
+///   writelane intrinsics.
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -42,8 +50,9 @@
 class AMDGPUAtomicOptimizer : public FunctionPass {
 public:
   static char ID;
-
-  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+  ScanOptions ScanImpl;
+  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
+      : FunctionPass(ID), ScanImpl(ScanImpl) {}
 
   bool runOnFunction(Function &F) override;
 
@@ -63,6 +72,7 @@
   DomTreeUpdater &DTU;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  ScanOptions ScanImpl;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
@@ -70,6 +80,11 @@
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V,
                          Value *const Identity) const;
 
+  std::pair<Value *, Value *>
+  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+                       Value *const Identity, Value *V, Instruction &I,
+                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;
+
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
@@ -78,8 +93,9 @@
 
   AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                             DomTreeUpdater &DTU, const GCNSubtarget *ST,
-                            bool IsPixelShader)
-      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader) {}
+                            bool IsPixelShader, ScanOptions ScanImpl)
+      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+        ScanImpl(ScanImpl) {}
 
   bool run(Function &F);
 
@@ -113,7 +129,8 @@
 
   bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader).run(F);
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
+      .run(F);
 }
 
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
@@ -128,7 +145,8 @@
 
   bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
-  return
AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader).run(F)
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
+                 .run(F)
              ? PreservedAnalyses::none()
              : PreservedAnalyses::all();
 }
@@ -491,6 +509,75 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value using an iterative approach. This is an alternative to the DPP
+// implementation, which relies on WWM: the loop visits each active lane and
+// reads, accumulates and writes back values with readlane and writelane.
+// Returns {exclusive scan, reduced value}; the scan is null if I is unused.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
+    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
+
+  auto *Ty = I.getType();
+  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+  auto *EntryBB = I.getParent();
+  const bool NeedResult = !I.use_empty();
+
+  auto *Ballot =
+      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
+  // Start inserting instructions for ComputeLoop block
+  B.SetInsertPoint(ComputeLoop);
+  // Phi nodes for Accumulator, Scan results destination, and Active Lanes
+  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
+  Accumulator->addIncoming(Identity, EntryBB);
+  PHINode *OldValuePhi = nullptr;
+  if (NeedResult) {
+    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
+    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
+  }
+  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
+  ActiveBits->addIncoming(Ballot, EntryBB);
+
+  // Use llvm.cttz intrinsic to find the lowest remaining active lane.
+  auto *FF1 =
+      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
+  // The lane select of readlane/writelane is i32, whatever the value type.
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
+
+  // Get the value required for atomic operation
+  auto *LaneValue =
+      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+
+  // Perform writelane if intermediate scan results are required later in the
+  // kernel computations
+  Value *OldValue = nullptr;
+  if (NeedResult) {
+    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
+    OldValuePhi->addIncoming(OldValue, ComputeLoop);
+  }
+
+  // Accumulate the results
+  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
+  Accumulator->addIncoming(NewAccumulator, ComputeLoop);
+
+  // Clear the bit of the current active lane so that on the next iteration
+  // llvm.cttz returns the next active lane
+  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);
+  auto *NewActiveBits = B.CreateAnd(ActiveBits, B.CreateNot(Mask));
+  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);
+
+  // Branch out of the loop when all lanes are processed.
+  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
+  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);
+
+  // Leave the builder positioned in ComputeEnd for the code that follows.
+  B.SetInsertPoint(ComputeEnd);
+
+  return {OldValue, NewAccumulator};
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -589,36 +676,48 @@
 
   const bool NeedResult = !I.use_empty();
 
+  Function *F = I.getFunction();
+  LLVMContext &C = F->getContext();
+  BasicBlock *ComputeLoop = nullptr;
+  BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - if (!NeedResult && ST->hasPermLaneX16()) { - // On GFX10 the permlanex16 instruction helps us build a reduction without - // too many readlanes and writelanes, which are generally bad for - // performance. - NewV = buildReduction(B, ScanOp, NewV, Identity); + if (ScanImpl == ScanOptions::DPP) { + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction + // without too many readlanes and writelanes, which are generally bad + // for performance. + NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + // Read the value from the last lane, which has accumulated the values + // of each active lane in the wavefront. This will be our new value + // which we will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, LastLaneIdx}); + } + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { - NewV = buildScan(B, ScanOp, NewV, Identity); - if (NeedResult) - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumulated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. 
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative implementation for scan
+      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
+      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
+      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
+                                                      ComputeLoop, ComputeEnd);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -671,6 +770,48 @@
 
   Instruction *const SingleLaneTerminator =
       SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
 
+  // Control flow is changed here after splitting I's block
+  assert(DTU.getDomTree().verify(DominatorTree::VerificationLevel::Full));
+
+  // At this point, we have split the I's block to allow one lane in wavefront
+  // to update the precomputed reduced value. Also, completed the codegen for
+  // new control flow i.e. iterative loop which perform reduction and scan using
+  // ComputeLoop and ComputeEnd.
+  // For the new control flow, we need to move branch instruction i.e.
+  // terminator created during SplitBlockAndInsertIfThen from I's block to
+  // ComputeEnd block. We also need to set up predecessor to next block when
+  // single lane done updating the final reduced value.
+  BasicBlock *Predecessor = nullptr;
+  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
+    // Move terminator from I's block to ComputeEnd block.
+    Instruction *Terminator = EntryBB->getTerminator();
+    B.SetInsertPoint(ComputeEnd);
+    Terminator->removeFromParent();
+    B.Insert(Terminator);
+
+    // Branch to ComputeLoop Block unconditionally from the I's block for
+    // iterative approach.
+    B.SetInsertPoint(EntryBB);
+    B.CreateBr(ComputeLoop);
+
+    // Update the dominator tree for new control flow. Moving the terminator
+    // re-homes both of its outgoing edges from EntryBB to ComputeEnd, so each
+    // of them is reported as a delete/insert pair in the same update batch.
+    assert(Terminator->getNumSuccessors() == 2 &&
+           "expected the conditional branch created by the block split");
+    DTU.applyUpdates(
+        {{DominatorTree::Insert, EntryBB, ComputeLoop},
+         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
+         {DominatorTree::Insert, ComputeEnd, Terminator->getSuccessor(0)},
+         {DominatorTree::Insert, ComputeEnd, Terminator->getSuccessor(1)},
+         {DominatorTree::Delete, EntryBB, Terminator->getSuccessor(0)},
+         {DominatorTree::Delete, EntryBB, Terminator->getSuccessor(1)}});
+    assert(DTU.getDomTree().verify(DominatorTree::VerificationLevel::Full));
+
+    Predecessor = ComputeEnd;
+  } else {
+    Predecessor = EntryBB;
+  }
   // Move the IR builder into single_lane next.
   B.SetInsertPoint(SingleLaneTerminator);
@@ -687,7 +828,7 @@
   if (NeedResult) {
     // Create a PHI node to get our new atomic result into the exit block.
     PHINode *const PHI = B.CreatePHI(Ty, 2);
-    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
+    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
     PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
 
     // We need to broadcast the value who was the lowest active lane (the first
@@ -721,8 +862,12 @@
     // from the first lane, to get our lane's index into the atomic result.
     Value *LaneOffset = nullptr;
     if (ValDivergent) {
-      LaneOffset =
-          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+      if (ScanImpl == ScanOptions::DPP) {
+        LaneOffset =
+            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+      } else {
+        LaneOffset = ExclScan;
+      }
     } else {
       switch (Op) {
       default:
@@ -771,6 +916,6 @@
 INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                     "AMDGPU atomic optimizations", false, false)
 
-FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
-  return new AMDGPUAtomicOptimizer();
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
+  return new AMDGPUAtomicOptimizer(ScanStrategy);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -275,11 +275,19 @@
     cl::init(true), cl::Hidden);
 
 // Enable atomic optimization
-static cl::opt<bool> EnableAtomicOptimizations(
-  "amdgpu-atomic-optimizations",
-  cl::desc("Enable atomic optimizations"),
-  
cl::init(false),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableAtomicOptimizations("amdgpu-atomic-optimizations",
+                              cl::desc("Enable atomic optimizations"),
+                              cl::init(false), cl::Hidden);
+
+static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
+    "amdgpu-atomic-optimizer-strategy",
+    cl::desc("Select DPP or Iterative strategy for scan"),
+    cl::init(ScanOptions::DPP),
+    cl::values(clEnumValN(ScanOptions::DPP, "DPP",
+                          "Use DPP operations for scan"),
+               clEnumValN(ScanOptions::Iterative, "Iterative",
+                          "Use Iterative approach for scan")));
 
 // Enable Mode register optimization
 static cl::opt<bool> EnableSIModeRegisterPass(
@@ -666,7 +674,8 @@
           return true;
         }
         if (PassName == "amdgpu-atomic-optimizer") {
-          PM.addPass(AMDGPUAtomicOptimizerPass(*this));
+          PM.addPass(
+              AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
           return true;
         }
         if (PassName == "amdgpu-codegenprepare") {
@@ -1138,7 +1147,7 @@
     addPass(createAMDGPULateCodeGenPreparePass());
 
   if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
-    addPass(createAMDGPUAtomicOptimizerPass());
+    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
   }
 
   if (TM->getOptLevel() > CodeGenOpt::None)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32
@llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true 
-amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) @@ -449,313 +449,257 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec 
+; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; 
GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: 
s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; 
%bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -782,328 +726,273 @@ ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; 
GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: 
buffer_atomic_add v0, v4, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX10W64-NEXT: 
buffer_atomic_add v4, v5, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_2: +; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v5, s8 -; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_2: +; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: 
v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 
s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB3_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB3_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1629,313 +1518,258 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; 
GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; 
%bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 
v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: 
$vgpr1 +; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; 
GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; 
GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -516,37 +516,33 @@ ; ; 
GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s7, s5, s4 +; GFX8-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; 
GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -557,50 +553,45 @@ ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s7, s5, s4 +; GFX9-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: 
s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -611,273 +602,214 @@ ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: 
s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s7, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], 
s[4:5] +; GFX1064-NEXT: s_add_i32 s6, s6, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; 
GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: 
s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 
v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB2_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2084,37 +2016,33 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s7, s5, s4 +; GFX8-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; 
GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB8_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2125,50 +2053,45 @@ ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: .LBB8_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s7, s5, s4 +; GFX9-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, 
v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2179,273 +2102,214 @@ ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: 
sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s7, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_add_i32 s6, s6, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB8_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s6 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB8_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 
s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: 
.LBB8_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz 
.LBB8_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB8_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: 
s_mov_b32 s5, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB8_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,11 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--force-update -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global 
-amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -491,269 +491,226 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: 
s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: 
s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 
s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: 
ds_add_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], 
s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -761,53 +718,43 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: 
s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -831,215 +778,199 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, 
v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v2, v0 +; GFX8-NEXT: ds_add_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; 
GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v2, v0 +; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: 
s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: ds_add_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: +; GFX1064-NEXT: .LBB3_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: ds_add_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: +; GFX1032-NEXT: .LBB3_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v0, v3 +; GFX1164-NEXT: ds_add_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: +; GFX1164-NEXT: .LBB3_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v0, v3 +; GFX1132-NEXT: ds_add_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: .LBB3_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2140,269 +2071,226 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: 
s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz 
.LBB9_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; 
GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: 
s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 -; 
GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, 
s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -2410,53 +2298,43 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: 
.LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: +; GFX1132-NEXT: .LBB9_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2480,215 +2358,199 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; 
GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB10_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB10_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 
s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB10_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v2, v0 +; GFX8-NEXT: ds_sub_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB10_2: +; GFX8-NEXT: .LBB10_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, 
s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_u32 v2, v0 +; GFX9-NEXT: ds_sub_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB10_2: +; GFX9-NEXT: .LBB10_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 
0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: ds_sub_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_2: +; GFX1064-NEXT: .LBB10_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp 
v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: ds_sub_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_2: +; GFX1032-NEXT: .LBB10_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB10_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; 
GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v0, v3 +; GFX1164-NEXT: ds_sub_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_2: +; GFX1164-NEXT: .LBB10_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB10_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v0, v3 +; GFX1132-NEXT: ds_sub_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_2: +; GFX1132-NEXT: .LBB10_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3344,273 +3206,226 @@ ; ; 
GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX8-NEXT: ; %bb.2: 
; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB14_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB14_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_2: +; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; 
GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_2: +; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_and_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB14_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_2: +; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_and_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 
-; GFX1032-NEXT: s_cbranch_execz .LBB14_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_2: +; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; 
GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_and_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB14_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB14_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -3618,53 +3433,43 @@ ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: 
s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_and_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB14_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_2: +; GFX1132-NEXT: .LBB14_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3694,269 +3499,226 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_or_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: 
s_cbranch_execz .LBB15_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB15_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_2: +; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; 
GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_or_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_2: +; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; 
GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_or_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: 
s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_2: +; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 
s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_or_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB15_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, 
s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3964,53 +3726,43 @@ ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_or_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_2: +; GFX1132-NEXT: .LBB15_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4040,269 +3792,226 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry 
-; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_xor_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB16_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_2: +; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 
row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_xor_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_2: +; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_2: +; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 
+; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_2: +; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, 
v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: 
; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB16_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB16_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4310,53 +4019,43 @@ ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB16_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_2: +; GFX1132-NEXT: .LBB16_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4386,273 +4085,226 @@ ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, 1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_max_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB17_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB17_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_2: +; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; 
GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, 1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_2: +; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: 
s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; 
GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, 1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB17_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_2: +; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 +; 
GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, 1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; 
GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB17_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_2: +; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, 1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: 
.LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB17_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB17_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; 
GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4660,53 +4312,43 @@ ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, 1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB17_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_2: +; GFX1132-NEXT: .LBB17_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4987,273 +4629,226 @@ ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; 
GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB19_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB19_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_2: +; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_2: +; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 
0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, -2 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB19_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_2: +; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: min_i32_varying: ; 
GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, -2 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: 
s_cbranch_execz .LBB19_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_2: +; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, -2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 
32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB19_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB19_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5261,53 
+4856,43 @@ ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, -2 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB19_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_2: +; GFX1132-NEXT: .LBB19_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5588,269 +5173,226 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: 
s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_max_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB21_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB21_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_2: +; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_2: +; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; 
GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB21_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_2: +; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: 
s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: 
s_cbranch_execz .LBB21_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_2: +; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: 
s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB21_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB21_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; 
GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5858,53 +5400,43 @@ ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB21_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_2: +; GFX1132-NEXT: .LBB21_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6180,273 +5712,226 @@ ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; 
%entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB23_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB23_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_2: +; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_2: +; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; 
GFX1064-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB23_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_2: +; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: 
umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB23_2 
-; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_2: +; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 
v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB23_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB23_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6454,53 +5939,43 @@ ; ; 
GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB23_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_2: +; GFX1132-NEXT: .LBB23_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32) @@ -448,313 +448,257 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: 
v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 
+; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: 
s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; 
GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo 
-; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1280,313 +1224,258 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; 
GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX8-NEXT: s_cbranch_execz .LBB6_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: +; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: 
v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: 
; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 
v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; 
GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; 
GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB6_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 -; 
GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; 
GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB6_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; 
GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32, i32) @@ -463,312 +463,263 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; 
GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: 
s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; 
GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; 
GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, 
s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: 
s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: 
s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: 
buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1411,312 +1362,264 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; 
GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; 
%bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: 
v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; 
GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 
0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: 
v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan.ll @@ -0,0 +1,121 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) , ptr addrspace(1) %val) #0 { +; IR-LABEL: @uniform_value( +; IR-NEXT: entry: +; IR-NEXT: [[UNIFORM_VALUE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; IR-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[UNIFORM_VALUE_KERNARG_SEGMENT]], i64 36 +; IR-NEXT: [[LOADED_OUT_KERNARG_OFFSET:%.*]] = load <2 x i64>, ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4 +; IR-NEXT: [[OUT_LOAD1:%.*]] = extractelement <2 x i64> [[LOADED_OUT_KERNARG_OFFSET]], i32 0 +; IR-NEXT: [[MEM_LOCATION:%.*]] = inttoptr i64 [[OUT_LOAD1]] to ptr addrspace(1) +; IR-NEXT: [[VAL_LOAD2:%.*]] = extractelement <2 x i64> [[LOADED_OUT_KERNARG_OFFSET]], i32 1 +; IR-NEXT: [[VALUE_ADDRESS:%.*]] = inttoptr i64 [[VAL_LOAD2]] to ptr addrspace(1) +; IR-NEXT: [[LANE:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x() +; IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[LANE]] to i64 +; IR-NEXT: [[ELE:%.*]] = getelementptr i32, ptr addrspace(1) [[VALUE_ADDRESS]], i64 [[IDXPROM]] +; IR-NEXT: [[VALUE:%.*]] = load i32, ptr addrspace(1) [[ELE]], align 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[MEM_LOCATION]], i32 4 +; IR-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-NEXT: [[TMP8:%.*]] = 
trunc i64 [[TMP7]] to i32 +; IR-NEXT: [[TMP9:%.*]] = mul i32 [[VALUE]], [[TMP8]] +; IR-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP13:%.*]] +; IR: 11: +; IR-NEXT: [[TMP12:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP9]] seq_cst, align 4 +; IR-NEXT: br label [[TMP13]] +; IR: 13: +; IR-NEXT: ret void +; +entry: + %uniform_value.kernarg.segment = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uniform_value.kernarg.segment, i64 36 + %loaded.out.kernarg.offset = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4 + %out.load1 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 0 + %mem.location = inttoptr i64 %out.load1 to ptr addrspace(1) + %val.load2 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 1 + %value.address = inttoptr i64 %val.load2 to ptr addrspace(1) + %lane = tail call i32 @llvm.amdgcn.workgroup.id.x() + %idxprom = sext i32 %lane to i64 + %ele = getelementptr i32, ptr addrspace(1) %value.address, i64 %idxprom + %value = load i32, ptr addrspace(1) %ele, align 4 + %gep = getelementptr i32, ptr addrspace(1) %mem.location, i32 4 + %old = atomicrmw volatile add ptr addrspace(1) %gep, i32 %value seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(1) %val) #0 { +; IR-LABEL: @divergent_value( +; IR-NEXT: entry: +; IR-NEXT: [[DIVERGENT_VALUE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; IR-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[DIVERGENT_VALUE_KERNARG_SEGMENT]], i64 36 +; IR-NEXT: [[LOADED_OUT_KERNARG_OFFSET:%.*]] = load <2 x i64>, ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4 +; IR-NEXT: [[OUT_LOAD1:%.*]] = extractelement <2 x i64> [[LOADED_OUT_KERNARG_OFFSET]], 
i32 0 +; IR-NEXT: [[MEM_LOCATION:%.*]] = inttoptr i64 [[OUT_LOAD1]] to ptr addrspace(1) +; IR-NEXT: [[VAL_LOAD2:%.*]] = extractelement <2 x i64> [[LOADED_OUT_KERNARG_OFFSET]], i32 1 +; IR-NEXT: [[VALUE_ADDRESS:%.*]] = inttoptr i64 [[VAL_LOAD2]] to ptr addrspace(1) +; IR-NEXT: [[LANE:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[LANE]] to i64 +; IR-NEXT: [[ELE:%.*]] = getelementptr i32, ptr addrspace(1) [[VALUE_ADDRESS]], i64 [[IDXPROM]] +; IR-NEXT: [[VALUE:%.*]] = load i32, ptr addrspace(1) [[ELE]], align 4 +; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[MEM_LOCATION]], i32 4 +; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0) +; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]]) +; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: br label [[COMPUTELOOP:%.*]] +; IR: 7: +; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP13:%.*]] seq_cst, align 4 +; IR-NEXT: br label [[TMP9:%.*]] +; IR: 9: +; IR-NEXT: ret void +; IR: ComputeLoop: +; IR-NEXT: [[ACCUMULATOR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP13]], [[COMPUTELOOP]] ] +; IR-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[ENTRY]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ] +; IR-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 +; IR-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[VALUE]], i32 [[TMP11]]) +; IR-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]] +; IR-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]] +; IR-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1 +; IR-NEXT: [[TMP16]] = and 
i64 [[ACTIVEBITS]], [[TMP15]] +; IR-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], 0 +; IR-NEXT: br i1 [[TMP17]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR: ComputeEnd: +; IR-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP5]], 0 +; IR-NEXT: br i1 [[TMP18]], label [[TMP7:%.*]], label [[TMP9]] +; +entry: + %divergent_value.kernarg.segment = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %divergent_value.kernarg.segment, i64 36 + %loaded.out.kernarg.offset = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4 + %out.load1 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 0 + %mem.location = inttoptr i64 %out.load1 to ptr addrspace(1) + %val.load2 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 1 + %value.address = inttoptr i64 %val.load2 to ptr addrspace(1) + %lane = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %lane to i64 + %ele = getelementptr i32, ptr addrspace(1) %value.address, i64 %idxprom + %value = load i32, ptr addrspace(1) %ele, align 4 + %gep = getelementptr i32, ptr addrspace(1) %mem.location, i32 4 + %old = atomicrmw volatile add ptr addrspace(1) %gep, i32 %value seq_cst, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + +attributes #0 = {"target-cpu"="gfx906"} +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none)} +