diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -47,12 +47,17 @@ DominatorTree *DT; const GCNSubtarget *ST; bool IsPixelShader; + bool IsGraphicsShader; Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; + std::pair<Value *, Value *> + buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *const Identity, Value *V, Instruction &I, + BasicBlock *Compute, BasicBlock *ComputeEnd) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,6 +98,7 @@ const TargetMachine &TM = TPC.getTM<TargetMachine>(); ST = &TM.getSubtarget<GCNSubtarget>(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + IsGraphicsShader = AMDGPU::isGraphics(F.getCallingConv()); visit(F); @@ -430,6 +436,83 @@ return V; } +// Use the builder to create an exclusive scan and compute the final reduced +// value using an iterative approach. This provides an alternative +// implementation to DPP which uses WWM for scan computations. This API iterates +// over active lanes to read, compute and update the value using readlane and +// writelane intrinsics. Returns {exclusive scan (or null), reduced value}. +std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively( + IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, + Instruction &I, BasicBlock *Compute, BasicBlock *ComputeEnd) const { + + Type *const Ty = I.getType(); + Module *M = B.GetInsertBlock()->getModule(); + unsigned WaveFrontSize = ST->isWave32() ? 
32 : 64; + Type *const WaveTy = B.getIntNTy(WaveFrontSize); + BasicBlock *const EntryBB = I.getParent(); + const bool NeedResult = !I.use_empty(); + + Value *Ballot = + B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue()); + Type *const BallotTy = Ballot->getType(); + const unsigned TyBitWidth = DL->getTypeSizeInBits(BallotTy); + + // Branch to Compute Block unconditionally from the I's block + B.CreateBr(Compute); + + // Start inserting instructions for Compute block + B.SetInsertPoint(Compute); + // Phi nodes for Accumulator, Scan results destination, and Active Lanes + PHINode *const Accumulator = B.CreatePHI(Ty, 2, "accum"); + Accumulator->addIncoming(Identity, EntryBB); + PHINode *OldValuePhi = nullptr; + if (NeedResult) { + OldValuePhi = B.CreatePHI(Ty, 2, "old_value_phi"); + OldValuePhi->addIncoming(V, EntryBB); + } + PHINode *const ActiveBits = B.CreatePHI(BallotTy, 2, "active_bits"); + ActiveBits->addIncoming(Ballot, EntryBB); + + // Call llvm.cttz intrinsic to find the lowest remaining active lane. 
+ Function *CttzDecl = Intrinsic::getDeclaration(M, Intrinsic::cttz, WaveTy); + Value *FF1 = B.CreateCall(CttzDecl, {ActiveBits, B.getTrue()}); + Value *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty()); + + // Get the value required for atomic operation + Value *LaneValue = + B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt}); + + // Perform writelane if intermediate scan results are required later in the + // kernel computations + Value *OldValue = nullptr; + if (NeedResult) { + Function *WriteLaneDecl = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + OldValue = + B.CreateCall(WriteLaneDecl, {Accumulator, LaneIdxInt, OldValuePhi}); + OldValuePhi->addIncoming(OldValue, Compute); + } + + // Accumulate the results + Value *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue); + Accumulator->addIncoming(NewAccumulator, Compute); + + // Set bit to zero of current active lane so that for next iteration llvm.cttz + // returns the next active lane + Value *Mask = B.CreateShl(B.getIntN(TyBitWidth, 1), FF1); + Value *InverseMask = B.CreateXor(Mask, B.getIntN(TyBitWidth, -1)); + Value *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask); + ActiveBits->addIncoming(NewActiveBits, Compute); + + // Branch out of the loop when all lanes are processed. 
+ Value *IsEnd = B.CreateICmpEQ(NewActiveBits, B.getIntN(TyBitWidth, 0)); + B.CreateCondBr(IsEnd, ComputeEnd, Compute); + + B.SetInsertPoint(ComputeEnd); + + return std::make_pair(OldValue, NewAccumulator); +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -528,36 +611,50 @@ const bool NeedResult = !I.use_empty(); + llvm::Function *F = I.getParent()->getParent(); + LLVMContext &C = I.getParent()->getContext(); + BasicBlock *Compute = nullptr; + BasicBlock *ComputeEnd = nullptr; + if (ValDivergent && !IsGraphicsShader) { + Compute = BasicBlock::Create(C, "Compute", F); + ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F); + } // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to the identity value, so - // that they can correctly contribute to the final result. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - if (!NeedResult && ST->hasPermLaneX16()) { - // On GFX10 the permlanex16 instruction helps us build a reduction without - // too many readlanes and writelanes, which are generally bad for - // performance. - NewV = buildReduction(B, ScanOp, NewV, Identity); + if (IsGraphicsShader) { + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction + // without too many readlanes and writelanes, which are generally bad + // for performance. 
+ NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + // Read the value from the last lane, which has accumulated the values + // of each active lane in the wavefront. This will be our new value + // which we will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, LastLaneIdx}); + } + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { - NewV = buildScan(B, ScanOp, NewV, Identity); - if (NeedResult) - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumulated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. - Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); + // Alternative implementation for scan + std::tie(ExclScan, NewV) = + buildScanIteratively(B, ScanOp, Identity, V, I, Compute, ComputeEnd); } - - // Finally mark the readlanes in the WWM section. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -594,11 +691,19 @@ } } - // We only want a single lane to enter our new control flow, and we do this - // by checking if there are any active lanes below us. Only one lane will - // have 0 active lanes below us, so that will be the only one to progress. - Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); - + Value *Cond = nullptr; + if (ValDivergent && !IsGraphicsShader) { + // Only the first active lane will enter the new control flow to update the + // value. 
+ CallInst *const FirstActiveLane = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt); + Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane); + } else { + // We only want a single lane to enter our new control flow, and we do this + // by checking if there are any active lanes below us. Only one lane will + // have 0 active lanes below us, so that will be the only one to progress. + Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); + } // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -610,6 +715,17 @@ Instruction *const SingleLaneTerminator = SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + BasicBlock *Predecessor = nullptr; + if (ValDivergent && !IsGraphicsShader) { + Instruction *Terminator = EntryBB->getTerminator(); + B.SetInsertPoint(ComputeEnd); + Instruction *NewTerminator = Terminator->clone(); + Terminator->eraseFromParent(); + B.Insert(NewTerminator); + Predecessor = ComputeEnd; + } else { + Predecessor = EntryBB; + } // Move the IR builder into single_lane next. B.SetInsertPoint(SingleLaneTerminator); @@ -626,7 +742,7 @@ if (NeedResult) { // Create a PHI node to get our new atomic result into the exit block. PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(PoisonValue::get(Ty), EntryBB); + PHI->addIncoming(PoisonValue::get(Ty), Predecessor); PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); // We need to broadcast the value who was the lowest active lane (the first @@ -660,8 +776,12 @@ // from the first lane, to get our lane's index into the atomic result. 
Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = - B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + if (IsGraphicsShader) { + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + } else { + LaneOffset = ExclScan; + } } else { switch (Op) { default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -270,11 +270,10 @@ cl::init(true), cl::Hidden); // Enable atomic optimization -static cl::opt EnableAtomicOptimizations( - "amdgpu-atomic-optimizations", - cl::desc("Enable atomic optimizations"), - cl::init(false), - cl::Hidden); +static cl::opt + EnableAtomicOptimizations("amdgpu-atomic-optimizations", + cl::desc("Enable atomic optimizations"), + cl::init(true), cl::Hidden); // Enable Mode register optimization static cl::opt EnableSIModeRegisterPass( diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -902,14 +902,36 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB32_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB32_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_movk_i32 s4, 0x3ffc +; GFX6-NEXT: s_movk_i32 s8, 0x3ffc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX6-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB32_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB32_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -917,14 +939,36 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB32_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB32_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_movk_i32 s4, 0x3ffc +; GFX7-NEXT: s_movk_i32 s8, 0x3ffc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX7-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB32_3: +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB32_4: ; %Flow +; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst @@ -935,37 +979,81 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) { ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB33_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB33_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_mov_b32 s9, 4 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_mov_b32 s2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB33_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB33_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0 -; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB33_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_mov_b32 s8, 0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB33_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: s_mov_b32 s9, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_mov_b32 s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB33_3: +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB33_4: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- 
a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -449,313 +449,265 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB2_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, 
s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB2_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; 
GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB2_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 
v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; 
GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB2_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 
s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB2_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: 
.LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB2_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -782,328 +734,281 @@ ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, 
v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB3_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, 
v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB3_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 
s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB3_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: 
s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX10W64-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_2: +; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: 
s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB3_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 
s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v5, s8 -; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_2: +; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB3_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; 
GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB3_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB3_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX11W32-NEXT: 
buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB3_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1629,313 +1534,266 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB7_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: 
sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB7_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: 
v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp 
v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB7_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz 
.LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, 
v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB7_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: 
v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB7_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; 
implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB7_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; 
GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -517,36 +517,33 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB2_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s7, s5 +; GFX8-NEXT: s_ff1_i32_b32 s8, s4 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s7, s8, s7 +; GFX8-NEXT: v_readlane_b32 s10, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s10 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -557,50 +554,46 @@ ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB2_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s7, s5 +; GFX9-NEXT: s_ff1_i32_b32 s8, s4 +; GFX9-NEXT: s_add_i32 s7, s7, 32 +; GFX9-NEXT: s_min_u32 s7, s8, s7 +; GFX9-NEXT: v_readlane_b32 s10, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -611,273 +604,220 @@ ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; 
GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: .LBB2_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1064-NEXT: s_ff1_i32_b32 s8, s4 +; GFX1064-NEXT: s_add_i32 s7, s7, 32 +; GFX1064-NEXT: s_min_u32 s7, s8, s7 +; GFX1064-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064-NEXT: s_add_i32 s6, s6, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: 
buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: .LBB2_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1032-NEXT: s_andn2_b32 s5, s5, s8 +; GFX1032-NEXT: s_add_i32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: 
v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1164-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: .LBB2_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s7, s5 +; GFX1164-NEXT: s_ctz_i32_b32 s8, s4 +; GFX1164-NEXT: s_add_i32 s7, s7, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s8, s7 +; GFX1164-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s10 +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s4, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; 
GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: .LBB2_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s6, 
s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1132-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1132-NEXT: s_and_not1_b32 s5, s5, s8 +; GFX1132-NEXT: s_add_i32 s4, s4, s7 +; GFX1132-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB2_4: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; 
GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2085,36 +2025,33 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB8_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s7, s5 +; GFX8-NEXT: s_ff1_i32_b32 s8, s4 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: 
s_min_u32 s7, s8, s7 +; GFX8-NEXT: v_readlane_b32 s10, v0, s7 +; GFX8-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8-NEXT: s_add_i32 s6, s6, s10 +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB8_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2125,50 +2062,46 @@ ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: .LBB8_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB8_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s7, s5 +; GFX9-NEXT: s_ff1_i32_b32 s8, s4 +; GFX9-NEXT: s_add_i32 s7, s7, 32 +; GFX9-NEXT: s_min_u32 s7, s8, s7 +; GFX9-NEXT: v_readlane_b32 s10, v0, s7 +; GFX9-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9-NEXT: s_add_i32 s6, s6, s10 +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2179,273 +2112,220 @@ ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; 
GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: .LBB8_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s7, s5 +; GFX1064-NEXT: s_ff1_i32_b32 s8, s4 +; GFX1064-NEXT: s_add_i32 s7, s7, 32 +; GFX1064-NEXT: s_min_u32 s7, s8, s7 +; GFX1064-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1064-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064-NEXT: s_add_i32 s6, s6, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, 
s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB8_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 
31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: .LBB8_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s6, s5 +; GFX1032-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1032-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1032-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1032-NEXT: s_andn2_b32 s5, s5, s8 +; GFX1032-NEXT: s_add_i32 s4, s4, s7 +; GFX1032-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1032-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; 
GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB8_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b32 s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: .LBB8_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s7, s5 +; GFX1164-NEXT: s_ctz_i32_b32 s8, s4 +; GFX1164-NEXT: s_add_i32 s7, s7, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s7, s8, s7 +; GFX1164-NEXT: v_readlane_b32 s10, v0, s7 +; GFX1164-NEXT: s_lshl_b64 s[8:9], 1, s7 +; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164-NEXT: s_and_not1_b64 s[4:5], s[4:5], 
s[8:9] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s6, s6, s10 +; GFX1164-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s4, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s4, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB8_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, 
s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: .LBB8_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s6, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1132-NEXT: s_lshl_b32 s8, 1, s6 +; GFX1132-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1132-NEXT: s_and_not1_b32 s5, s5, s8 +; GFX1132-NEXT: s_add_i32 s4, s4, s7 +; GFX1132-NEXT: s_cmp_lg_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB8_4: +; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s @@ -491,269 +491,232 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 
exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB2_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB2_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB2_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; 
GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB2_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz 
.LBB2_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB2_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; 
GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB2_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ 
-761,53 +724,45 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB2_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -831,215 +786,207 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; 
GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB3_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v2, v0 +; GFX8-NEXT: ds_add_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB3_1: ; %Compute +; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v2, v0 +; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB3_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: ds_add_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: +; GFX1064-NEXT: .LBB3_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: ds_add_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: +; GFX1032-NEXT: .LBB3_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, 
exec_hi, v1 +; GFX1164-NEXT: .LBB3_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v0, v3 +; GFX1164-NEXT: ds_add_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: +; GFX1164-NEXT: .LBB3_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; 
GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v0, v3 +; GFX1132-NEXT: ds_add_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: .LBB3_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -2140,269 +2087,232 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB9_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, 
s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB9_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 
s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB9_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, 
v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB9_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_add_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: 
s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB9_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_add_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, 
v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB9_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_add_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; 
GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2410,53 +2320,45 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB9_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_add_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: 
ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: +; GFX1132-NEXT: .LBB9_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2480,215 +2382,207 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: 
s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB10_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX8-NEXT: .LBB10_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s3, s1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s0 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: v_readlane_b32 s6, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_cbranch_execz .LBB10_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v2, v0 +; GFX8-NEXT: ds_sub_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB10_2: +; GFX8-NEXT: .LBB10_4: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: .LBB10_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s0 +; GFX9-NEXT: s_add_i32 s3, s3, 32 +; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: v_readlane_b32 s6, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_u32 v2, v0 +; GFX9-NEXT: ds_sub_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB10_2: +; GFX9-NEXT: .LBB10_4: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB10_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 +; GFX1064-NEXT: s_add_i32 s3, s3, 32 +; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064-NEXT: s_add_i32 s2, s2, s6 +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: ds_sub_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_2: +; GFX1064-NEXT: .LBB10_4: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB10_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032-NEXT: s_add_i32 s0, s0, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 
vcc_lo, s1, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: ds_sub_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_2: +; GFX1032-NEXT: .LBB10_4: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-NEXT: s_cbranch_execz .LBB10_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB10_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 +; GFX1164-NEXT: s_add_i32 s3, s3, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_add_i32 s2, s2, s6 +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v0, v3 +; GFX1164-NEXT: ds_sub_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_2: +; GFX1164-NEXT: .LBB10_4: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB10_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1132-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_add_i32 s0, s0, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 +; 
GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-NEXT: s_cbranch_execz .LBB10_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX1132-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v0, v3 +; GFX1132-NEXT: ds_sub_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_2: +; GFX1132-NEXT: .LBB10_4: ; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3344,273 +3238,232 @@ ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; 
GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB14_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB14_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB14_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_2: +; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB14_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; 
GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_2: +; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: 
v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB14_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_and_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: 
$vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB14_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_2: +; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: .LBB14_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_and_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB14_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_2: +; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, 
exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 
v2, s4 -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB14_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_and_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz 
.LBB14_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB14_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3618,53 +3471,45 @@ ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; 
GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: .LBB14_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_and_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB14_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_2: +; GFX1132-NEXT: .LBB14_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3694,269 +3539,232 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; 
GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB15_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_or_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB15_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB15_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_2: +; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB15_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; 
GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_or_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB15_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_or_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: 
v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_2: +; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB15_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_or_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_2: +; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v2, 
v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB15_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_or_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 +; 
GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB15_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3964,53 +3772,45 @@ ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB15_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_or_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_2: +; GFX1132-NEXT: .LBB15_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4040,269 +3840,232 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec 
-; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB16_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_xor_b32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB16_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_2: +; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, 
s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB16_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_xor_b32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_2: +; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; 
GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB16_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 
s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_2: +; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB16_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB16_4 +; 
GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_2: +; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB16_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, 
v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB16_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB16_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4310,53 +4073,45 @@ ; ; 
GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB16_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB16_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_2: +; GFX1132-NEXT: .LBB16_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4386,273 +4141,232 @@ ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB17_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_max_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB17_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB17_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_2: +; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp 
v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB17_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_2: +; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_i32_e32 
v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, 1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 
+; GFX1064-NEXT: .LBB17_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB17_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_2: +; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, 1 +; GFX1032-NEXT: .LBB17_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB17_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_2: +; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: max_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: max_i32_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, 1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: .LBB17_1: ; %Compute +; 
GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB17_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB17_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: 
s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4660,53 +4374,45 @@ ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, 1 +; GFX1132-NEXT: .LBB17_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB17_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_2: +; GFX1132-NEXT: .LBB17_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4987,273 +4693,232 @@ ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB19_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB19_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB19_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_2: +; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, 
v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB19_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_2: +; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_brev_b32 s4, -2 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; 
GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB19_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_i32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_2: +; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; 
GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_brev_b32 s2, -2 +; GFX1032-NEXT: .LBB19_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_i32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB19_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_2: +; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_brev_b32 s4, -2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_i32_dpp v1, 
v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB19_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_i32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB19_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB19_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: 
v_min_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5261,53 +4926,45 @@ ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_brev_b32 s2, -2 +; GFX1132-NEXT: .LBB19_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: 
s_ctz_i32_b32 s4, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_i32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB19_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_2: +; GFX1132-NEXT: .LBB19_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; 
GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5588,269 +5245,232 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB21_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: 
s_max_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB21_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB21_4 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_2: +; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB21_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_max_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_2: +; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, 
s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB21_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_max_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB21_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_2: +; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 
v0, v3 -; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB21_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 
1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_max_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB21_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_2: +; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; 
GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; 
GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB21_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_max_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB21_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB21_4: +; GFX1164-NEXT: s_or_b64 exec, exec, 
s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5858,53 +5478,45 @@ ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB21_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_max_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB21_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_2: +; GFX1132-NEXT: .LBB21_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: 
v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6180,273 +5792,232 @@ ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: s_mov_b32 s4, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB23_1: 
; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_min_u32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB23_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB23_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_2: +; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 
-1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX9-NEXT: s_mov_b32 s4, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: .LBB23_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_min_u32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 
v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_2: +; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b32 s4, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; 
GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: .LBB23_1: ; %Compute +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 +; GFX1064-NEXT: s_add_i32 s5, s5, 32 +; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064-NEXT: s_min_u32 s4, s4, s8 +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB23_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; 
GFX1064-NEXT: .LBB23_2: +; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: 
.LBB23_1: ; %Compute +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1032-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032-NEXT: s_min_u32 s2, s2, s5 +; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB23_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_2: +; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s4, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; 
GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: .LBB23_1: ; %Compute +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 +; GFX1164-NEXT: s_add_i32 s5, s5, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 +; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: s_min_u32 s4, s4, s8 +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB23_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: 
buffer_gl0_inv -; GFX1164-NEXT: .LBB23_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB23_4: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6454,53 +6025,45 @@ ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, 
v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: .LBB23_1: ; %Compute +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132-NEXT: s_lshl_b32 s6, 1, s4 +; GFX1132-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132-NEXT: s_min_u32 s2, s2, s5 +; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB23_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_2: +; GFX1132-NEXT: .LBB23_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -448,313 +448,265 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: 
v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB2_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB2_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB2_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; 
GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
-; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB2_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 
s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB2_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; 
%ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB2_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: 
s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1280,313 +1232,266 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; 
GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: .LBB6_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 
s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: +; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB6_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; 
GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB6_1: ; %Compute +; GFX10W64-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB6_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; 
GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB6_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB6_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; 
GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB6_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: .LBB6_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -463,312 +463,271 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc 
bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB2_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; 
GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB2_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; 
%ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 
-; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB2_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: 
s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, 
v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB2_1: ; %Compute +; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB2_4: ; 
GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB2_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB2_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB2_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1411,312 +1370,272 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 
v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: .LBB7_1: ; %Compute +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s5, s3 +; GFX8-NEXT: s_ff1_i32_b32 s6, s2 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: v_readlane_b32 s8, v0, s5 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_writelane_b32 v1, s4, m0 +; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: .LBB7_1: ; %Compute +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s5, s3 +; GFX9-NEXT: s_ff1_i32_b32 s6, s2 +; GFX9-NEXT: s_add_i32 s5, s5, 32 +; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: v_readlane_b32 s8, v0, s5 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: v_writelane_b32 v1, s4, m0 +; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 
v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec +; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; 
GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: .LBB7_1: ; %Compute +; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 +; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 +; GFX10W64-NEXT: s_add_i32 s5, s5, 32 +; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 +; GFX10W32-NEXT: .LBB7_1: ; %Compute +; 
GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 +; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX10W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; 
GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: .LBB7_1: ; %Compute +; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 +; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 +; GFX11W64-NEXT: s_add_i32 s5, s5, 32 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 
idxen glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: .LBB7_1: ; %Compute +; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd +; GFX11W32-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: .LBB7_4: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -4,6 +4,7 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) dereferenceable(18446744073709551615) %arg0, i32 %arg1) { ; GCN-LABEL: name: mmo_offsets0 ; GCN: bb.0.bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: liveins: $sgpr0, $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -32,165 +33,545 @@ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed 
[[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 80, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]].sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY3]], [[COPY5]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY4]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY6]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_]], killed [[V_MOV_B32_dpp]], 0, implicit $exec + ; 
GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_ADD_U32_e64_]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[V_MOV_B32_dpp1]], 0, implicit $exec + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_U32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_1]], killed [[V_MOV_B32_dpp2]], 0, implicit $exec + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_U32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_2]], killed [[V_MOV_B32_dpp3]], 0, implicit $exec + ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_U32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_3]], killed [[V_MOV_B32_dpp4]], 0, implicit $exec + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_ADD_U32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_4]], killed [[V_MOV_B32_dpp5]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_5]], killed [[S_MOV_B32_1]] + ; GCN-NEXT: early-clobber %1:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], 
[[S_MOV_B32_]], implicit $exec + ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1 (%ir-block.25): + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %1 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY12]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_2]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 80, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2 (%ir-block.27): + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY13]].sub0 + ; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY13]].sub1 + ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_1:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY14]], [[COPY16]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_1:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY15]], killed [[V_MBCNT_LO_U32_B32_e64_1]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_1:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_3]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_SET_INACTIVE_B32_1]], 273, 15, 15, 0, 
implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_1]], killed [[V_MOV_B32_dpp6]], 0, implicit $exec + ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp7:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY18]], [[V_ADD_U32_e64_6]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_6]], killed [[V_MOV_B32_dpp7]], 0, implicit $exec + ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp8:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY19]], [[V_ADD_U32_e64_7]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_7]], killed [[V_MOV_B32_dpp8]], 0, implicit $exec + ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp9:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY20]], [[V_ADD_U32_e64_8]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_8]], killed [[V_MOV_B32_dpp9]], 0, implicit $exec + ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp10:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY21]], [[V_ADD_U32_e64_9]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_9]], killed [[V_MOV_B32_dpp10]], 0, implicit $exec + ; GCN-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp11:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY22]], [[V_ADD_U32_e64_10]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_10]], killed [[V_MOV_B32_dpp11]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_11]], killed [[S_MOV_B32_4]] + ; GCN-NEXT: early-clobber 
%3:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_1]], [[S_MOV_B32_3]], implicit $exec + ; GCN-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_1]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3 (%ir-block.53): + ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY %3 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY23]], killed [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4 (%ir-block.55): + ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 96, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; 
GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 96, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 112, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 112, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: 
[[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY2]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: [[COPY24:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET1]] + ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 64 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: [[COPY25:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET2]] + ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 128 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: [[COPY26:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET3]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), 
align 1, addrspace 7) + ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY27]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY3]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: 
[[COPY28:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]] + ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 72 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: [[COPY29:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]] + ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: [[COPY30:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN-NEXT: 
BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY4]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY32]].sub0 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[COPY32]].sub1 + ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_2:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY33]], [[COPY35]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_2:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY34]], killed [[V_MBCNT_LO_U32_B32_e64_2]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_2:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_6]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp12:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY36]], [[V_SET_INACTIVE_B32_2]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_2]], killed [[V_MOV_B32_dpp12]], 0, implicit $exec + ; GCN-NEXT: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp13:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY37]], [[V_ADD_U32_e64_12]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_12]], killed [[V_MOV_B32_dpp13]], 0, implicit $exec + ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = 
COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp14:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY38]], [[V_ADD_U32_e64_13]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_13]], killed [[V_MOV_B32_dpp14]], 0, implicit $exec + ; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp15:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY39]], [[V_ADD_U32_e64_14]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_14]], killed [[V_MOV_B32_dpp15]], 0, implicit $exec + ; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp16:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY40]], [[V_ADD_U32_e64_15]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_15]], killed [[V_MOV_B32_dpp16]], 0, implicit $exec + ; GCN-NEXT: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp17:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY41]], [[V_ADD_U32_e64_16]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_16]], killed [[V_MOV_B32_dpp17]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_2:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_17]], killed [[S_MOV_B32_11]] + ; GCN-NEXT: early-clobber %15:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_2]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_2]], [[S_MOV_B32_6]], implicit $exec + ; GCN-NEXT: [[SI_IF2:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_2]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5 (%ir-block.81): + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} 
+ ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY42:%[0-9]+]]:vgpr_32 = COPY %15 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY42]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_12]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6 (%ir-block.83): + ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY43:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY44:%[0-9]+]]:sreg_32 = COPY [[COPY43]].sub0 + ; GCN-NEXT: [[COPY45:%[0-9]+]]:sreg_32 = COPY [[COPY43]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_3:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY44]], [[V_MOV_B32_e32_4]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_3:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY45]], killed [[V_MBCNT_LO_U32_B32_e64_3]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_3:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_4]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp18:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_SET_INACTIVE_B32_3]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_3]], killed [[V_MOV_B32_dpp18]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp19:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_18]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_18]], killed [[V_MOV_B32_dpp19]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp20:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_19]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_19]], killed [[V_MOV_B32_dpp20]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp21:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_20]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_20]], killed [[V_MOV_B32_dpp21]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp22:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_21]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_21]], killed [[V_MOV_B32_dpp22]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp23:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_22]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_22]], killed [[V_MOV_B32_dpp23]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_3:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_23]], killed [[S_MOV_B32_13]] + ; GCN-NEXT: early-clobber %17:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_3]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_3]], [[V_MOV_B32_e32_4]], implicit $exec + ; GCN-NEXT: [[SI_IF3:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_3]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.7 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7 (%ir-block.109): + ; GCN-NEXT: successors: %bb.8(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 80 + ; GCN-NEXT: [[COPY46:%[0-9]+]]:vgpr_32 = COPY %17 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY46]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_14]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 
7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.8 (%ir-block.111): + ; GCN-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY47:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY47]].sub0 + ; GCN-NEXT: [[COPY49:%[0-9]+]]:sreg_32 = COPY [[COPY47]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_4:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY48]], [[V_MOV_B32_e32_5]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_4:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY49]], killed [[V_MBCNT_LO_U32_B32_e64_4]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_4:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_5]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp24:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_SET_INACTIVE_B32_4]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_4]], killed [[V_MOV_B32_dpp24]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp25:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_24]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_24]], killed [[V_MOV_B32_dpp25]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp26:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_25]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_25]], killed [[V_MOV_B32_dpp26]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp27:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_26]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 
[[V_ADD_U32_e64_26]], killed [[V_MOV_B32_dpp27]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp28:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_27]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_27]], killed [[V_MOV_B32_dpp28]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp29:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_28]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_28]], killed [[V_MOV_B32_dpp29]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_4:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_29]], killed [[S_MOV_B32_15]] + ; GCN-NEXT: early-clobber %19:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_4]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_4:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_4]], [[V_MOV_B32_e32_5]], implicit $exec + ; GCN-NEXT: [[SI_IF4:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_4]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.9 (%ir-block.137): + ; GCN-NEXT: successors: %bb.10(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY %19 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_16]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.10 (%ir-block.139): + ; GCN-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF4]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: 
BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY51:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY51]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY5]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: 
BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 88 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 176 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY54:%[0-9]+]]:vreg_128 = COPY [[COPY24]] + ; GCN-NEXT: [[COPY55:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY54]], [[S_LOAD_DWORDX4_IMM]], [[COPY55]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 96 + ; GCN-NEXT: [[COPY56:%[0-9]+]]:vreg_128 = COPY [[COPY25]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY56]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_20]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 192 + ; GCN-NEXT: [[COPY57:%[0-9]+]]:vreg_128 = COPY [[COPY26]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY57]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY58]], 192, 
0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY59:%[0-9]+]]:vreg_128 = COPY [[COPY28]] + ; GCN-NEXT: [[COPY60:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[COPY60]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 104 + ; GCN-NEXT: [[COPY61:%[0-9]+]]:vreg_128 = COPY [[COPY29]] + ; GCN-NEXT: 
BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY61]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_22]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 208 + ; GCN-NEXT: [[COPY62:%[0-9]+]]:vreg_128 = COPY [[COPY30]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY62]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY63]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY8]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY9]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN 
[[COPY10]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY11]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY64:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY64]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY65:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN2]] + ; GCN-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sreg_32 = S_MOV_B32 112 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_24]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from 
unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY66:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN3]] + ; GCN-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sreg_32 = S_MOV_B32 224 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_25]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY67:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN4]] + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_25]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY68:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY68]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY69:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_7]], [[S_LOAD_DWORDX4_IMM]], [[COPY69]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY70:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN6]] + ; GCN-NEXT: [[COPY71:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY71]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: 
[[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY13]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 - ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY14]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 - ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY15]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY16]], [[S_LOAD_DWORDX4_IMM]], [[COPY17]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: [[COPY73:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]] + ; GCN-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sreg_32 = S_MOV_B32 120 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_26]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: [[COPY74:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]] + ; GCN-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sreg_32 = S_MOV_B32 240 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_27]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: [[COPY75:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_27]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY76:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY76]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY 
[[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_7]], [[S_LOAD_DWORDX4_IMM]], [[COPY77]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY78:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]] + ; GCN-NEXT: [[COPY79:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY79]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY21]], [[S_LOAD_DWORDX4_IMM]], 
[[COPY22]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY80:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY80]].sub0 + ; GCN-NEXT: [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY80]].sub1 + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_5:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY81]], [[V_MOV_B32_e32_6]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_5:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY82]], killed [[V_MBCNT_LO_U32_B32_e64_5]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_5:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_6]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp30:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_SET_INACTIVE_B32_5]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_5]], killed [[V_MOV_B32_dpp30]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp31:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_30]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_30]], killed [[V_MOV_B32_dpp31]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp32:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_31]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_31]], killed [[V_MOV_B32_dpp32]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp33:%[0-9]+]]:vgpr_32 
= V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_32]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_32]], killed [[V_MOV_B32_dpp33]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp34:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_33]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_33]], killed [[V_MOV_B32_dpp34]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp35:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_34]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_34]], killed [[V_MOV_B32_dpp35]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_5:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_35]], killed [[S_MOV_B32_28]] + ; GCN-NEXT: early-clobber %35:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_5]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_5:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_5]], [[V_MOV_B32_e32_6]], implicit $exec + ; GCN-NEXT: [[SI_IF5:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_5]], %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.11 (%ir-block.165): + ; GCN-NEXT: successors: %bb.12(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY83:%[0-9]+]]:vgpr_32 = COPY %35 + ; GCN-NEXT: [[COPY84:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY83]], [[COPY84]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_29]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.12 (%ir-block.167): + ; GCN-NEXT: successors: 
%bb.13(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY85:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY86:%[0-9]+]]:sreg_32 = COPY [[COPY85]].sub0 + ; GCN-NEXT: [[COPY87:%[0-9]+]]:sreg_32 = COPY [[COPY85]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_6:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY86]], [[V_MOV_B32_e32_8]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_6:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY87]], killed [[V_MBCNT_LO_U32_B32_e64_6]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_6:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_8]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp36:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_SET_INACTIVE_B32_6]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_6]], killed [[V_MOV_B32_dpp36]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp37:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_36]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_36]], killed [[V_MOV_B32_dpp37]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp38:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_37]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_37]], killed [[V_MOV_B32_dpp38]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp39:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_38]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_38]], killed [[V_MOV_B32_dpp39]], 0, implicit $exec + ; GCN-NEXT: 
[[V_MOV_B32_dpp40:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_39]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_39]], killed [[V_MOV_B32_dpp40]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp41:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_40]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_40]], killed [[V_MOV_B32_dpp41]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_6:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_41]], killed [[S_MOV_B32_30]] + ; GCN-NEXT: early-clobber %37:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_6]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_6:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_6]], [[V_MOV_B32_e32_8]], implicit $exec + ; GCN-NEXT: [[SI_IF6:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_6]], %bb.14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.13 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.13 (%ir-block.193): + ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sreg_32 = S_MOV_B32 128 + ; GCN-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY88:%[0-9]+]]:vgpr_32 = COPY %37 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY88]], killed [[V_MOV_B32_e32_9]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_31]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.14 (%ir-block.195): + ; GCN-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF6]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY89:%[0-9]+]]:sreg_64 = 
COPY $exec + ; GCN-NEXT: [[COPY90:%[0-9]+]]:sreg_32 = COPY [[COPY89]].sub0 + ; GCN-NEXT: [[COPY91:%[0-9]+]]:sreg_32 = COPY [[COPY89]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_7:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY90]], [[V_MOV_B32_e32_10]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_7:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY91]], killed [[V_MBCNT_LO_U32_B32_e64_7]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_7:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_10]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp42:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_SET_INACTIVE_B32_7]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_7]], killed [[V_MOV_B32_dpp42]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp43:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_42]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_42]], killed [[V_MOV_B32_dpp43]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp44:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_43]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_43]], killed [[V_MOV_B32_dpp44]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp45:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_44]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_44]], killed [[V_MOV_B32_dpp45]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp46:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_45]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 
[[V_ADD_U32_e64_45]], killed [[V_MOV_B32_dpp46]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp47:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_46]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_46]], killed [[V_MOV_B32_dpp47]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_7:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_47]], killed [[S_MOV_B32_32]] + ; GCN-NEXT: early-clobber %39:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_7]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_7:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_7]], [[V_MOV_B32_e32_10]], implicit $exec + ; GCN-NEXT: [[SI_IF7:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_7]], %bb.16, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.15 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.15 (%ir-block.221): + ; GCN-NEXT: successors: %bb.16(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY92:%[0-9]+]]:vgpr_32 = COPY %39 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY92]], killed [[V_MOV_B32_e32_11]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_33]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.16 (%ir-block.223): + ; GCN-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF7]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: 
[[S_MOV_B32_34:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], killed [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_34]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY93:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_12]], [[S_LOAD_DWORDX4_IMM]], [[COPY93]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY94:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY95:%[0-9]+]]:sreg_32 = COPY [[COPY94]].sub0 + ; GCN-NEXT: [[COPY96:%[0-9]+]]:sreg_32 = COPY [[COPY94]].sub1 + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_8:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY95]], [[V_MOV_B32_e32_12]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_8:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY96]], killed [[V_MBCNT_LO_U32_B32_e64_8]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_8:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_12]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp48:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_SET_INACTIVE_B32_8]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_8]], killed [[V_MOV_B32_dpp48]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp49:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_48]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_48]], killed [[V_MOV_B32_dpp49]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp50:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_49]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_49]], killed [[V_MOV_B32_dpp50]], 0, implicit $exec + ; GCN-NEXT: 
[[V_MOV_B32_dpp51:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_50]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_50]], killed [[V_MOV_B32_dpp51]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp52:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_51]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_51]], killed [[V_MOV_B32_dpp52]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp53:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_52]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_52]], killed [[V_MOV_B32_dpp53]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_8:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_53]], killed [[S_MOV_B32_35]] + ; GCN-NEXT: early-clobber %41:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_8]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_8:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_8]], [[V_MOV_B32_e32_12]], implicit $exec + ; GCN-NEXT: [[SI_IF8:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_8]], %bb.18, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.17 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.17 (%ir-block.249): + ; GCN-NEXT: successors: %bb.18(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY97:%[0-9]+]]:vgpr_32 = COPY %41 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY97]], killed [[V_MOV_B32_e32_13]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_36]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: 
bb.18 (%ir-block.251): + ; GCN-NEXT: SI_END_CF [[SI_IF8]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY23]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY24]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY26]], [[S_LOAD_DWORDX4_IMM]], [[COPY27]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: 
BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[COPY98:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY98]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sreg_32 = S_MOV_B32 136 + ; GCN-NEXT: [[COPY99:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY99]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_38]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sreg_32 = S_MOV_B32 272 + ; GCN-NEXT: [[COPY100:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY100]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_39]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_37]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_39]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY101:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY102:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; 
GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY101]], [[S_LOAD_DWORDX4_IMM]], [[COPY102]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: 
[[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY103:%[0-9]+]]:vreg_128 = COPY [[COPY65]] + ; GCN-NEXT: [[COPY104:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY103]], [[COPY104]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[COPY105:%[0-9]+]]:vreg_128 = COPY [[COPY66]] + ; GCN-NEXT: [[COPY106:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY105]], [[COPY106]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_40]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sreg_32 = S_MOV_B32 288 + ; GCN-NEXT: [[COPY107:%[0-9]+]]:vreg_128 = COPY [[COPY67]] + ; GCN-NEXT: [[COPY108:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY107]], [[COPY108]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_41]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact 
[[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_41]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY109:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY110:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY109]], [[S_LOAD_DWORDX4_IMM]], [[COPY110]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY111:%[0-9]+]]:vreg_128 = COPY [[COPY70]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY111]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY33]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 - ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY35]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[COPY37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY112:%[0-9]+]]:vreg_128 = COPY [[COPY73]] + ; GCN-NEXT: [[COPY113:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY112]], [[COPY113]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sreg_32 = S_MOV_B32 152 + ; GCN-NEXT: [[COPY114:%[0-9]+]]:vreg_128 = COPY [[COPY74]] + ; GCN-NEXT: [[COPY115:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY114]], [[COPY115]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_42]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: 
[[S_MOV_B32_43:%[0-9]+]]:sreg_32 = S_MOV_B32 304 + ; GCN-NEXT: [[COPY116:%[0-9]+]]:vreg_128 = COPY [[COPY75]] + ; GCN-NEXT: [[COPY117:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY116]], [[COPY117]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_43]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_43]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY118:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY119:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY118]], [[S_LOAD_DWORDX4_IMM]], [[COPY119]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY120:%[0-9]+]]:vreg_128 = COPY [[COPY78]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY120]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -9,14 +9,27 @@ ; CHECK-LABEL: add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 
s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_add v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_add_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -31,14 +44,27 @@ ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], 
s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_sub v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_sub_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -53,14 +79,26 @@ ; CHECK-LABEL: and: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_and v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_and_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -75,14 +113,28 @@ ; CHECK-LABEL: or: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_or v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_or_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -97,14 +149,29 @@ ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: 
global_atomic_xor v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -155,14 +222,27 @@ ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -212,14 +292,27 @@ ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -269,14 +362,28 @@ ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: 
v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -326,14 +433,26 @@ ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_umin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -616,15 +735,29 @@ define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB23_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_add_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -638,15 +771,29 @@ define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB24_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_sub_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -660,16 +807,30 @@ define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB25_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_min_i32_e32 v0, s2, v0 +; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -682,16 +843,30 @@ define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB26_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_max_i32_e32 v0, s2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -704,15 +879,28 @@ define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: 
buffer.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB27_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_min_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -726,16 +914,30 @@ define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB28_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_max_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -748,15 +950,28 @@ define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB29_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_and_b32_e32 v0, s2, v0 +; CHECK-NEXT: 
v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -770,16 +985,30 @@ define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB30_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_or_b32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -792,15 +1021,31 @@ define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB31_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_xor v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_xor_b32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -10,16 +10,28 @@ define amdgpu_kernel void @alloc_lds_gds(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel @@ -32,18 +44,44 @@ define amdgpu_kernel void @alloc_lds_gds_align(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds_align: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:140 +; GCN-NEXT: ds_add_u32 v0, v1 offset:140 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_2: +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: ; %bb.3: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_4: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -256,11 +256,14 @@ ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations +; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches @@ -556,11 +559,14 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations +; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; 
GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -864,11 +870,14 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations +; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1185,11 +1194,14 @@ ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations +; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -573,12 +573,36 @@ ret void } -; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store: -; CGN: global_store_dword -; GCN: ds_add_u32 -; GCN: s_load_dword s -; GCN-NOT: global_load_dword -; GCN: global_store_dword +; GCN-LABEL: no_alias_atomic_rmw_then_no_alias_store: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; 
GCN-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s3, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: s_mul_i32 s2, s2, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v0, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB17_2: +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_barrier +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dword v1, v0, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_endpgm define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) { ; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store( ; CHECK-NEXT: entry: diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -7,19 +7,19 @@ ; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2 -; GCN-NEXT: s_mov_b32 s7, 0 +; GCN-NEXT: s_mov_b32 s8, 0 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 -; GCN-NEXT: s_cbranch_execz .LBB0_5 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GCN-NEXT: s_cbranch_execz .LBB0_6 ; GCN-NEXT: .LBB0_2: ; %bb ; GCN-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_and_b32 s8, exec_lo, s6 -; GCN-NEXT: s_or_b32 s7, s8, s7 -; GCN-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GCN-NEXT: s_and_b32 s7, exec_lo, s6 +; GCN-NEXT: s_or_b32 s8, s7, s8 +; GCN-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -27,18 +27,37 @@ ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: s_or_saveexec_b32 s7, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s9 +; GCN-NEXT: s_mov_b32 exec_lo, s7 ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5 ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: s_not_b32 exec_lo, exec_lo +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: s_not_b32 exec_lo, exec_lo +; GCN-NEXT: s_or_saveexec_b32 s7, -1 +; GCN-NEXT: v_add_nc_u32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN-NEXT: v_add_nc_u32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN-NEXT: v_add_nc_u32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN-NEXT: v_add_nc_u32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN-NEXT: v_mov_b32_e32 v4, v3 +; GCN-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GCN-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GCN-NEXT: s_mov_b32 exec_lo, s7 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s7, 0, v1 +; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s7 +; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: buffer_atomic_add v0, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB0_1 -; GCN-NEXT: 
.LBB0_5: ; %bb5 +; GCN-NEXT: .LBB0_6: ; %bb5 ; GCN-NEXT: s_endpgm .entry: br label %bb