diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -85,7 +85,8 @@ void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; -FunctionPass *createAMDGPUAtomicOptimizerPass(); +enum class ScanOptions : bool { DPP, Iterative }; +FunctionPass *createAMDGPUAtomicOptimizerPass(bool UseDpp); void initializeAMDGPUAtomicOptimizerPass(PassRegistry &); extern char &AMDGPUAtomicOptimizerID; @@ -234,11 +235,13 @@ }; struct AMDGPUAtomicOptimizerPass : PassInfoMixin { - AMDGPUAtomicOptimizerPass(TargetMachine &TM) : TM(TM) {} + AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl) + : TM(TM), ScanImpl(ScanImpl) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); private: TargetMachine &TM; + ScanOptions ScanImpl; }; Pass *createAMDGPUStructurizeCFGPass(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -41,8 +41,9 @@ class AMDGPUAtomicOptimizer : public FunctionPass { public: static char ID; - - AMDGPUAtomicOptimizer() : FunctionPass(ID) {} + ScanOptions ScanImpl; + AMDGPUAtomicOptimizer(ScanOptions ScanImpl) + : FunctionPass(ID), ScanImpl(ScanImpl) {} bool runOnFunction(Function &F) override; @@ -62,6 +63,7 @@ DominatorTree *DT; const GCNSubtarget *ST; bool IsPixelShader; + ScanOptions ScanImpl; Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; @@ -69,6 +71,11 @@ Value *const Identity) const; Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; + std::pair + buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *const Identity, Value *V, Instruction &I, + BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const; + void 
optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -77,8 +84,9 @@ AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL, DominatorTree *DT, const GCNSubtarget *ST, - bool IsPixelShader) - : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {} + bool IsPixelShader, ScanOptions ScanImpl) + : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader), + ScanImpl(ScanImpl) {} bool run(Function &F); @@ -111,7 +119,8 @@ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; - return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F); + return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader, ScanImpl) + .run(F); } PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F, @@ -125,7 +134,8 @@ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; - return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F) + return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader, ScanImpl) + .run(F) ? PreservedAnalyses::none() : PreservedAnalyses::all(); } @@ -470,6 +480,78 @@ return V; } +// Use the builder to create an exclusive scan and compute the final reduced +// value using an iterative approach. This provides an alternative +// implementation to DPP which uses WWM for scan computations. This API iterates +// over active lanes to read, compute and update the value using +// readlane and writelane intrinsics. +std::pair AMDGPUAtomicOptimizerImpl::buildScanIteratively( + IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, + Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const { + + Type *const Ty = I.getType(); + unsigned WaveFrontSize = ST->isWave32() ? 
32 : 64; + Type *const WaveTy = B.getIntNTy(WaveFrontSize); + BasicBlock *const EntryBB = I.getParent(); + const bool NeedResult = !I.use_empty(); + + Value *Ballot = + B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue()); + + // Branch to ComputeLoop Block unconditionally from the I's block + B.CreateBr(ComputeLoop); + + // Start inserting instructions for ComputeLoop block + B.SetInsertPoint(ComputeLoop); + // Phi nodes for Accumulator, Scan results destination, and Active Lanes + PHINode *const Accumulator = B.CreatePHI(Ty, 2, "accum"); + Accumulator->addIncoming(Identity, EntryBB); + PHINode *OldValuePhi = nullptr; + if (NeedResult) { + OldValuePhi = B.CreatePHI(Ty, 2, "old_value_phi"); + OldValuePhi->addIncoming(UndefValue::get(Ty), EntryBB); + } + PHINode *const ActiveBits = B.CreatePHI(WaveTy, 2, "active_bits"); + ActiveBits->addIncoming(Ballot, EntryBB); + + // Use llvm.cttz intrinsic to find the lowest remaining active lane. + Value *FF1 = + B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); + Value *LaneIdxInt = B.CreateTrunc(FF1, Ty); + + // Get the value required for atomic operation + Value *LaneValue = + B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt}); + + // Perform writelane if intermediate scan results are required later in the + // kernel computations + Value *OldValue = nullptr; + if (NeedResult) { + OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, + {Accumulator, LaneIdxInt, OldValuePhi}); + OldValuePhi->addIncoming(OldValue, ComputeLoop); + } + + // Accumulate the results + Value *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue); + Accumulator->addIncoming(NewAccumulator, ComputeLoop); + + // Set bit to zero of current active lane so that for next iteration llvm.cttz + // returns the next active lane + Value *Mask = B.CreateShl(B.getIntN(WaveFrontSize, 1), FF1); + Value *InverseMask = B.CreateXor(Mask, B.getIntN(WaveFrontSize, -1)); + Value *NewActiveBits = 
B.CreateAnd(ActiveBits, InverseMask); + ActiveBits->addIncoming(NewActiveBits, ComputeLoop); + + // Branch out of the loop when all lanes are processed. + Value *IsEnd = B.CreateICmpEQ(NewActiveBits, B.getIntN(WaveFrontSize, 0)); + B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop); + + B.SetInsertPoint(ComputeEnd); + + return std::make_pair(OldValue, NewAccumulator); +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -568,36 +650,48 @@ const bool NeedResult = !I.use_empty(); + Function *F = I.getFunction(); + LLVMContext &C = I.getParent()->getContext(); + BasicBlock *ComputeLoop = nullptr; + BasicBlock *ComputeEnd = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to the identity value, so - // that they can correctly contribute to the final result. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - if (!NeedResult && ST->hasPermLaneX16()) { - // On GFX10 the permlanex16 instruction helps us build a reduction without - // too many readlanes and writelanes, which are generally bad for - // performance. - NewV = buildReduction(B, ScanOp, NewV, Identity); + if (ScanImpl == ScanOptions::DPP) { + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction + // without too many readlanes and writelanes, which are generally bad + // for performance. 
+ NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + // Read the value from the last lane, which has accumulated the values + // of each active lane in the wavefront. This will be our new value + // which we will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, LastLaneIdx}); + } + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { - NewV = buildScan(B, ScanOp, NewV, Identity); - if (NeedResult) - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumulated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. - Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); + // Alternative implementation for scan + ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F); + ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F); + std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I, + ComputeLoop, ComputeEnd); } - - // Finally mark the readlanes in the WWM section. 
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -650,6 +744,16 @@ Instruction *const SingleLaneTerminator = SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + BasicBlock *Predecessor = nullptr; + if (ValDivergent && ScanImpl == ScanOptions::Iterative) { + Instruction *Terminator = EntryBB->getTerminator(); + B.SetInsertPoint(ComputeEnd); + Terminator->removeFromParent(); + B.Insert(Terminator); + Predecessor = ComputeEnd; + } else { + Predecessor = EntryBB; + } // Move the IR builder into single_lane next. B.SetInsertPoint(SingleLaneTerminator); @@ -666,7 +770,7 @@ if (NeedResult) { // Create a PHI node to get our new atomic result into the exit block. PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(PoisonValue::get(Ty), EntryBB); + PHI->addIncoming(PoisonValue::get(Ty), Predecessor); PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); // We need to broadcast the value who was the lowest active lane (the first @@ -700,8 +804,12 @@ // from the first lane, to get our lane's index into the atomic result. Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = - B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + if (ScanImpl == ScanOptions::DPP) { + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + } else { + LaneOffset = ExclScan; + } } else { switch (Op) { default: @@ -750,6 +858,7 @@ INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) -FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() { - return new AMDGPUAtomicOptimizer(); +FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(bool IsUseDpp) { + return new AMDGPUAtomicOptimizer(IsUseDpp ? 
ScanOptions::DPP + : ScanOptions::Iterative); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -269,12 +269,15 @@ cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden); +static cl::opt EnableAtomicOptimizationsUsingDPP( + "amdgpu-atomic-optimizer-use-dpp", + cl::desc("Use DPP in the atomic optimizer"), cl::init(true), cl::Hidden); + // Enable atomic optimization -static cl::opt EnableAtomicOptimizations( - "amdgpu-atomic-optimizations", - cl::desc("Enable atomic optimizations"), - cl::init(false), - cl::Hidden); +static cl::opt + EnableAtomicOptimizations("amdgpu-atomic-optimizations", + cl::desc("Enable atomic optimizations"), + cl::init(false), cl::Hidden); // Enable Mode register optimization static cl::opt EnableSIModeRegisterPass( @@ -655,7 +658,10 @@ return true; } if (PassName == "amdgpu-atomic-optimizer") { - PM.addPass(AMDGPUAtomicOptimizerPass(*this)); + PM.addPass( + AMDGPUAtomicOptimizerPass(*this, EnableAtomicOptimizationsUsingDPP + ? 
ScanOptions::DPP + : ScanOptions::Iterative)); return true; } return false; @@ -1121,7 +1127,7 @@ addPass(createAMDGPULateCodeGenPreparePass()); if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { - addPass(createAMDGPUAtomicOptimizerPass()); + addPass(createAMDGPUAtomicOptimizerPass(EnableAtomicOptimizationsUsingDPP)); } if (TM->getOptLevel() > CodeGenOpt::None) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 
-amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; 
RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) @@ -449,313 +449,257 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, 
exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, 
exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz 
.LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; 
GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; 
GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, 
v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; 
GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; 
GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -782,328 +726,273 @@ ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 
idxen glc +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX10W64-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_2: +; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: 
struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 
s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v5, s8 -; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_2: +; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; 
GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; 
GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[4:7], 0 
idxen glc -; GFX11W32-NEXT: .LBB3_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB3_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1629,313 +1518,258 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc 
bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, 
v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; 
GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; 
GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; 
GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: 
s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -517,36 +517,32 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s7, s5 +; GFX8-NEXT: s_ff1_i32_b32 s8, s4 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s7, s8, s7 +; GFX8-NEXT: v_readlane_b32 s10, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s10 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -557,50 +553,45 @@ ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s7, s5 +; GFX9-NEXT: s_ff1_i32_b32 s8, s4 +; GFX9-NEXT: s_add_i32 s7, s7, 32 +; GFX9-NEXT: s_min_u32 s7, s8, s7 +; GFX9-NEXT: 
v_readlane_b32 s10, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -611,273 +602,214 @@ ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1064-NEXT: s_ff1_i32_b32 s8, s4 +; GFX1064-NEXT: s_add_i32 s7, s7, 32 +; GFX1064-NEXT: s_min_u32 s7, s8, s7 +; GFX1064-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064-NEXT: s_add_i32 s6, s6, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1032-NEXT: s_andn2_b32 s5, s5, s8 +; GFX1032-NEXT: s_add_i32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: 
s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 
s[4:5], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s7, s5 +; GFX1164-NEXT: s_ctz_i32_b32 s8, s4 +; GFX1164-NEXT: s_add_i32 s7, s7, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s8, s7 +; GFX1164-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s10 +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 
exec, exec, s[8:9] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: 
v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s6, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1132-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1132-NEXT: s_and_not1_b32 s5, s5, s8 +; GFX1132-NEXT: s_add_i32 s4, s4, s7 +; GFX1132-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB2_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2085,36 +2017,32 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; 
GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s7, s5 +; GFX8-NEXT: s_ff1_i32_b32 s8, s4 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s7, s8, s7 +; GFX8-NEXT: v_readlane_b32 s10, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s10 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB8_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2125,50 +2053,45 @@ ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: .LBB8_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s7, s5 +; GFX9-NEXT: s_ff1_i32_b32 s8, s4 +; GFX9-NEXT: s_add_i32 s7, s7, 32 +; GFX9-NEXT: s_min_u32 s7, s8, s7 +; GFX9-NEXT: v_readlane_b32 s10, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: 
s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2179,273 +2102,214 @@ ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1064-NEXT: s_ff1_i32_b32 s8, s4 +; GFX1064-NEXT: s_add_i32 s7, s7, 32 +; GFX1064-NEXT: s_min_u32 s7, s8, s7 +; GFX1064-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064-NEXT: s_add_i32 s6, s6, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: 
$vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB8_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1032-NEXT: s_andn2_b32 s5, s5, s8 +; GFX1032-NEXT: s_add_i32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032-NEXT: ; 
%bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB8_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 
v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s7, s5 +; GFX1164-NEXT: s_ctz_i32_b32 s8, s4 +; GFX1164-NEXT: s_add_i32 s7, s7, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s8, s7 +; GFX1164-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s10 +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB8_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s6, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1132-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1132-NEXT: s_and_not1_b32 s5, s5, s8 +; GFX1132-NEXT: s_add_i32 s4, s4, s7 +; GFX1132-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; 
GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB8_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,11 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 
-mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global 
-amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -491,269 +491,226 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; 
%ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, 
v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 
v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; 
GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
-; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, 
s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: 
ds_add_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -761,53 +718,43 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -831,215 +778,199 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v2, v0 +; GFX8-NEXT: ds_add_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, 
v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v2, v0 +; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: ds_add_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: +; GFX1064-NEXT: .LBB3_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: ds_add_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: +; GFX1032-NEXT: .LBB3_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: 
s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v0, v3 +; GFX1164-NEXT: ds_add_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: +; GFX1164-NEXT: .LBB3_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; 
%bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v0, v3 +; GFX1132-NEXT: ds_add_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: .LBB3_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2140,269 +2071,226 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB9_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 
0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: 
$vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: ; 
%bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; 
GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2410,53 +2298,43 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz 
.LBB9_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: +; GFX1132-NEXT: .LBB9_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2480,215 +2358,199 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 
1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB10_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB10_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB10_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v2, v0 +; GFX8-NEXT: ds_sub_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB10_2: +; GFX8-NEXT: .LBB10_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB10_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_u32 v2, v0 +; GFX9-NEXT: ds_sub_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB10_2: +; GFX9-NEXT: .LBB10_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: 
sub_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; 
GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: ds_sub_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_2: +; GFX1064-NEXT: .LBB10_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: 
s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: ds_sub_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_2: +; GFX1032-NEXT: .LBB10_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; 
GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB10_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v0, v3 +; GFX1164-NEXT: ds_sub_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_2: +; GFX1164-NEXT: .LBB10_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB10_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop 
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v0, v3 +; GFX1132-NEXT: ds_sub_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_2: +; GFX1132-NEXT: .LBB10_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3344,273 +3206,226 @@ ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; 
GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB14_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB14_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_2: +; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; 
GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; 
GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_2: +; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_and_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB14_2 -; GFX1064-NEXT: ; %bb.1: +; 
GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_2: +; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_and_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB14_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_2: +; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: 
v_and_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 
16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_and_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB14_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; 
GFX1164-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB14_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3618,53 +3433,43 @@ ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_and_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB14_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; 
GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_2: +; GFX1132-NEXT: .LBB14_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3694,269 +3499,226 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 
-; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_or_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB15_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB15_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_2: +; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 
; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 
+; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_or_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_2: +; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_or_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_2: +; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 +; 
GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: 
v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_or_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; 
GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB15_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3964,53 +3726,43 @@ ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_or_b32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_or_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: 
s_cbranch_execz .LBB15_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_2: +; GFX1132-NEXT: .LBB15_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4040,269 +3792,226 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; 
GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_xor_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB16_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_2: +; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: 
v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_xor_b32 s4, s4, s8 +; 
GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_2: +; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_2 -; GFX1064-NEXT: ; %bb.1: -; 
GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_2: +; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_2: +; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp 
v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], 
vcc -; GFX1164-NEXT: s_cbranch_execz .LBB16_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB16_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4310,53 +4019,43 @@ ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB16_2 
-; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_2: +; GFX1132-NEXT: .LBB16_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4386,273 +4085,226 @@ ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: 
v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, 1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_max_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB17_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB17_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_2: +; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, 1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, 
s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_2: +; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, 1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], 
vcc -; GFX1064-NEXT: s_cbranch_execz .LBB17_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_2: +; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: 
v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, 1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB17_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_2: +; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: 
v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, 1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB17_2 -; 
GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB17_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4660,53 +4312,43 @@ ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, 1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB17_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_2: +; GFX1132-NEXT: .LBB17_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4987,273 +4629,226 @@ ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; 
GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB19_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB19_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_2: +; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v0 +; 
GFX8-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_2: +; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: 
v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, -2 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; 
GFX1064-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_2: +; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, -2 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB19_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_2: +; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, -2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB19_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164-NEXT: ; 
%bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB19_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5261,53 +4856,43 @@ ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, -2 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB19_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_2: +; GFX1132-NEXT: .LBB19_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5588,269 +5173,226 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp 
v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_max_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB21_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB21_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_2: +; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_2: +; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: 
v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB21_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz 
.LBB21_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_2: +; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB21_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_2: +; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; 
GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB21_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; 
GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB21_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5858,53 +5400,43 @@ ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB21_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB21_4 +; 
GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_2: +; GFX1132-NEXT: .LBB21_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6180,273 +5712,226 @@ ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; 
GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB23_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB23_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_2: +; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v0 +; 
GFX8-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_2: +; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: 
v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB23_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: 
s_cbranch_execz .LBB23_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_2: +; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, 
v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB23_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_2: +; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; 
GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB23_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB23_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6454,53 +5939,43 @@ ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: 
v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB23_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_2: +; GFX1132-NEXT: .LBB23_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) @@ -448,313 +448,257 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; 
GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 
v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, 
exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: 
s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: 
buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, 
exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1280,313 +1224,258 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 
wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: +; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: 
sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; 
%ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 +; 
GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: 
v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB6_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: 
v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 
+; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB6_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga 
-amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 
-amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) @@ -463,312 +463,263 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: 
flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 
0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: 
v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: 
.LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: 
s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1411,312 +1362,264 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: 
s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; 
GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 
row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; 
GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; 
GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; 
GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc 
+; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: