Index: llvm/lib/CodeGen/MachineLICM.cpp =================================================================== --- llvm/lib/CodeGen/MachineLICM.cpp +++ llvm/lib/CodeGen/MachineLICM.cpp @@ -110,6 +110,7 @@ "Number of instructions not hoisted due to block frequency"); namespace { + enum HoistResult { NotHoisted = 0, Hoisted = 1, CSEd = 2 }; class MachineLICMBase : public MachineFunctionPass { const TargetInstrInfo *TII = nullptr; @@ -130,13 +131,18 @@ // State that is updated as we process loops bool Changed = false; // True if a loop is changed. bool FirstInLoop = false; // True if it's the first LICM in the loop. - MachineLoop *CurLoop = nullptr; // The current loop we are working on. - MachineBasicBlock *CurPreheader = nullptr; // The preheader for CurLoop. - // Exit blocks for CurLoop. - SmallVector ExitBlocks; - bool isExitBlock(const MachineBasicBlock *MBB) const { + // Exit blocks of each Loop. + DenseMap> ExitBlockMap; + + bool isExitBlock(MachineLoop *CurLoop, const MachineBasicBlock *MBB) { + if (ExitBlockMap.contains(CurLoop)) + return is_contained(ExitBlockMap[CurLoop], MBB); + + SmallVector ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + ExitBlockMap[CurLoop] = ExitBlocks; return is_contained(ExitBlocks, MBB); } @@ -151,8 +157,10 @@ // Register pressure on path leading from loop preheader to current BB. SmallVector, 16> BackTrace; - // For each opcode, keep a list of potential CSE instructions. - DenseMap> CSEMap; + // For each opcode per preheader, keep a list of potential CSE instructions. + DenseMap>> + CSEMap; enum { SpeculateFalse = 0, @@ -187,6 +195,7 @@ RegLimit.clear(); BackTrace.clear(); CSEMap.clear(); + ExitBlockMap.clear(); } private: @@ -200,24 +209,27 @@ : MI(mi), Def(def), FI(fi) {} }; - void HoistRegionPostRA(); + void HoistRegionPostRA(MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader); - void HoistPostRA(MachineInstr *MI, unsigned Def); + void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader); void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, BitVector &PhysRegClobbers, SmallSet &StoredFIs, - SmallVectorImpl &Candidates); + SmallVectorImpl &Candidates, + MachineLoop *CurLoop); - void AddToLiveIns(MCRegister Reg); + void AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop); - bool IsLICMCandidate(MachineInstr &I); + bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop); - bool IsLoopInvariantInst(MachineInstr &I); + bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop); - bool HasLoopPHIUse(const MachineInstr *MI) const; + bool HasLoopPHIUse(const MachineInstr *MI, MachineLoop *CurLoop); - bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, - Register Reg) const; + bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, Register Reg, + MachineLoop *CurLoop) const; bool IsCheapInstruction(MachineInstr &MI) const; @@ -226,9 +238,9 @@ void UpdateBackTraceRegPressure(const MachineInstr *MI); - bool IsProfitableToHoist(MachineInstr &MI); + bool IsProfitableToHoist(MachineInstr &MI, MachineLoop *CurLoop); - bool IsGuaranteedToExecute(MachineBasicBlock *BB); + bool IsGuaranteedToExecute(MachineBasicBlock *BB, MachineLoop *CurLoop); bool isTriviallyReMaterializable(const MachineInstr &MI) const; @@ -241,7 +253,8 @@ DenseMap &OpenChildren, const DenseMap &ParentMap); - void HoistOutOfLoop(MachineDomTreeNode *HeaderN); + void HoistOutOfLoop(MachineDomTreeNode *HeaderN, MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader); void InitRegPressure(MachineBasicBlock *BB); @@ -252,7 +265,7 @@ void UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef = false); - MachineInstr *ExtractHoistableLoad(MachineInstr *MI); + MachineInstr *ExtractHoistableLoad(MachineInstr *MI, MachineLoop *CurLoop); MachineInstr *LookForDuplicate(const MachineInstr *MI, std::vector &PrevMIs); @@ -263,13 +276,15 @@ bool MayCSE(MachineInstr *MI); - bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader); + HoistResult Hoist(MachineInstr *MI, MachineBasicBlock *Preheader, + MachineLoop *CurLoop); void InitCSEMap(MachineBasicBlock *BB); bool isTgtHotterThanSrc(MachineBasicBlock *SrcBlock, MachineBasicBlock *TgtBlock); - MachineBasicBlock *getCurPreheader(); + MachineBasicBlock *getCurPreheader(MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader); }; class MachineLICM : public MachineLICMBase { @@ -314,19 +329,6 @@ INITIALIZE_PASS_END(EarlyMachineLICM, "early-machinelicm", "Early Machine Loop Invariant Code Motion", false, false) -/// Test if the given loop is the outer-most loop that has a unique predecessor. -static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { - // Check whether this loop even has a unique predecessor. - if (!CurLoop->getLoopPredecessor()) - return false; - // Ok, now check to see if any of its outer loops do. - for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop()) - if (L->getLoopPredecessor()) - return false; - // None of them did, so this is the outermost with a unique predecessor. - return true; -} - bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -368,27 +370,17 @@ SmallVector Worklist(MLI->begin(), MLI->end()); while (!Worklist.empty()) { - CurLoop = Worklist.pop_back_val(); - CurPreheader = nullptr; - ExitBlocks.clear(); - - // If this is done before regalloc, only visit outer-most preheader-sporting - // loops. - if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) { - Worklist.append(CurLoop->begin(), CurLoop->end()); - continue; - } - - CurLoop->getExitBlocks(ExitBlocks); + MachineLoop *CurLoop = Worklist.pop_back_val(); + MachineBasicBlock *CurPreheader = nullptr; if (!PreRegAlloc) - HoistRegionPostRA(); + HoistRegionPostRA(CurLoop, CurPreheader); else { // CSEMap is initialized for loop header when the first instruction is // being hoisted. MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader()); FirstInLoop = true; - HoistOutOfLoop(N); + HoistOutOfLoop(N, CurLoop, CurPreheader); CSEMap.clear(); } } @@ -420,11 +412,11 @@ /// Examine the instruction for potentai LICM candidate. Also /// gather register def and frame object update information. -void MachineLICMBase::ProcessMI(MachineInstr *MI, - BitVector &PhysRegDefs, +void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, BitVector &PhysRegClobbers, SmallSet &StoredFIs, - SmallVectorImpl &Candidates) { + SmallVectorImpl &Candidates, + MachineLoop *CurLoop) { bool RuledOut = false; bool HasNonInvariantUse = false; unsigned Def = 0; @@ -502,7 +494,7 @@ // operands. FIXME: Consider unfold load folding instructions. if (Def && !RuledOut) { int FI = std::numeric_limits::min(); - if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) || + if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop)) || (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI))) Candidates.push_back(CandidateInfo(MI, Def, FI)); } @@ -510,8 +502,9 @@ /// Walk the specified region of the CFG and hoist loop invariants out to the /// preheader. -void MachineLICMBase::HoistRegionPostRA() { - MachineBasicBlock *Preheader = getCurPreheader(); +void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader) { + MachineBasicBlock *Preheader = getCurPreheader(CurLoop, CurPreheader); if (!Preheader) return; @@ -544,7 +537,8 @@ SpeculationState = SpeculateUnknown; for (MachineInstr &MI : *BB) - ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); + ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates, + CurLoop); } // Gather the registers read / clobbered by the terminator. @@ -592,14 +586,14 @@ } } if (Safe) - HoistPostRA(MI, Candidate.Def); + HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader); } } } /// Add register 'Reg' to the livein sets of BBs in the current loop, and make /// sure it is not killed by any instructions in the loop. -void MachineLICMBase::AddToLiveIns(MCRegister Reg) { +void MachineLICMBase::AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop) { for (MachineBasicBlock *BB : CurLoop->getBlocks()) { if (!BB->isLiveIn(Reg)) BB->addLiveIn(Reg); @@ -616,8 +610,10 @@ /// When an instruction is found to only use loop invariant operands that is /// safe to hoist, this instruction is called to do the dirty work. -void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) { - MachineBasicBlock *Preheader = getCurPreheader(); +void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def, + MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader) { + MachineBasicBlock *Preheader = getCurPreheader(CurLoop, CurPreheader); // Now move the instructions to the predecessor, inserting it before any // terminator instructions. @@ -638,7 +634,7 @@ // Add register to livein list to all the BBs in the current loop since a // loop invariant must be kept live throughout the whole loop. This is // important to ensure later passes do not scavenge the def register. - AddToLiveIns(Def); + AddToLiveIns(Def, CurLoop); ++NumPostRAHoisted; Changed = true; @@ -646,7 +642,8 @@ /// Check if this mbb is guaranteed to execute. If not then a load from this mbb /// may not be safe to hoist. -bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) { +bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB, + MachineLoop *CurLoop) { if (SpeculationState != SpeculateUnknown) return SpeculationState == SpeculateFalse; @@ -717,8 +714,10 @@ /// specified header block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. -void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { - MachineBasicBlock *Preheader = getCurPreheader(); +void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN, + MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader) { + MachineBasicBlock *Preheader = getCurPreheader(CurLoop, CurPreheader); if (!Preheader) return; @@ -782,10 +781,29 @@ // Process the block SpeculationState = SpeculateUnknown; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { - if (!Hoist(&MI, Preheader)) + HoistResult HoistRes = HoistResult::NotHoisted; + HoistRes = Hoist(&MI, Preheader, CurLoop); + if (HoistRes == HoistResult::NotHoisted) { + // We have failed to hoist MI to outermost loop's preheader. If MI is in + // a subloop, try to hoist it to subloop's preheader. + SmallVector InnerLoopWorkList; + for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop; + L = L->getParentLoop()) + InnerLoopWorkList.push_back(L); + + while (!InnerLoopWorkList.empty()) { + MachineLoop *InnerLoop = InnerLoopWorkList.pop_back_val(); + MachineBasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); + if (InnerLoopPreheader) { + HoistRes = Hoist(&MI, InnerLoopPreheader, InnerLoop); + if (HoistRes != HoistResult::NotHoisted) + break; + } + } + } + + if (HoistRes != HoistResult::CSEd) UpdateRegPressure(&MI); - // If we have hoisted an instruction that may store, it can only be a - // constant store. } // If it's a leaf node, it's done. Traverse upwards to pop ancestors. @@ -970,7 +988,7 @@ /// Returns true if the instruction may be a suitable candidate for LICM. /// e.g. If the instruction is a call, then it's obviously not safe to hoist it. -bool MachineLICMBase::IsLICMCandidate(MachineInstr &I) { +bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) { // Check if it's safe to move the instruction. bool DontMoveAcrossStore = true; if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) && @@ -986,7 +1004,7 @@ // from a jump table. // Stores and side effects are already checked by isSafeToMove. if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) && - !IsGuaranteedToExecute(I.getParent())) { + !IsGuaranteedToExecute(I.getParent(), CurLoop)) { LLVM_DEBUG(dbgs() << "LICM: Load not guaranteed to execute.\n"); return false; } @@ -1005,8 +1023,9 @@ } /// Returns true if the instruction is loop invariant. -bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I) { - if (!IsLICMCandidate(I)) { +bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, + MachineLoop *CurLoop) { + if (!IsLICMCandidate(I, CurLoop)) { LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n"); return false; } @@ -1015,8 +1034,9 @@ /// Return true if the specified instruction is used by a phi node and hoisting /// it could cause a copy to be inserted. -bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const { - SmallVector Work(1, MI); +bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI, + MachineLoop *CurLoop) { + SmallVector Work(1, MI); do { MI = Work.pop_back_val(); for (const MachineOperand &MO : MI->all_defs()) { @@ -1033,7 +1053,7 @@ // A PHI in an exit block can cause a copy to be inserted if the PHI // has multiple predecessors in the loop with different values. // For now, approximate by rejecting all exit blocks. - if (isExitBlock(UseMI.getParent())) + if (isExitBlock(CurLoop, UseMI.getParent())) return true; continue; } @@ -1049,7 +1069,8 @@ /// Compute operand latency between a def of 'Reg' and an use in the current /// loop, return true if the target considered it high. bool MachineLICMBase::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, - Register Reg) const { + Register Reg, + MachineLoop *CurLoop) const { if (MRI->use_nodbg_empty(Reg)) return false; @@ -1144,7 +1165,8 @@ /// Return true if it is potentially profitable to hoist the given loop /// invariant. -bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) { +bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI, + MachineLoop *CurLoop) { if (MI.isImplicitDef()) return true; @@ -1164,7 +1186,7 @@ return true; bool CheapInstr = IsCheapInstruction(MI); - bool CreatesCopy = HasLoopPHIUse(&MI); + bool CreatesCopy = HasLoopPHIUse(&MI, CurLoop); // Don't hoist a cheap instruction if it would create a copy in the loop. if (CheapInstr && CreatesCopy) { @@ -1186,7 +1208,7 @@ Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; - if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) { + if (MO.isDef() && HasHighOperandLatency(MI, i, Reg, CurLoop)) { LLVM_DEBUG(dbgs() << "Hoist High Latency: " << MI); ++NumHighLatency; return true; @@ -1220,7 +1242,7 @@ // instruction is not guaranteed to be executed in the loop, it's best to be // conservative. if (AvoidSpeculation && - (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) { + (!IsGuaranteedToExecute(MI.getParent(), CurLoop) && !MayCSE(&MI))) { LLVM_DEBUG(dbgs() << "Won't speculate: " << MI); return false; } @@ -1239,7 +1261,8 @@ /// Unfold a load from the given machineinstr if the load itself could be /// hoisted. Return the unfolded and hoistable load, or null if the load /// couldn't be unfolded or if it wouldn't be hoistable. -MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) { +MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI, + MachineLoop *CurLoop) { // Don't unfold simple loads. if (MI->canFoldAsLoad()) return nullptr; @@ -1280,7 +1303,8 @@ MBB->insert(Pos, NewMIs[1]); // If unfolding produced a load that wasn't loop-invariant or profitable to // hoist, discard the new instructions and bail. - if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) { + if (!IsLoopInvariantInst(*NewMIs[0], CurLoop) || + !IsProfitableToHoist(*NewMIs[0], CurLoop)) { NewMIs[0]->eraseFromParent(); NewMIs[1]->eraseFromParent(); return nullptr; @@ -1304,7 +1328,7 @@ /// out of the loop. void MachineLICMBase::InitCSEMap(MachineBasicBlock *BB) { for (MachineInstr &MI : *BB) - CSEMap[MI.getOpcode()].push_back(&MI); + CSEMap[BB][MI.getOpcode()].push_back(&MI); } /// Find an instruction amount PrevMIs that is a duplicate of MI. @@ -1328,7 +1352,7 @@ DenseMap>::iterator &CI) { // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate // the undef property onto uses. - if (CI == CSEMap.end() || MI->isImplicitDef()) + if (MI->isImplicitDef()) return false; if (MachineInstr *Dup = LookForDuplicate(MI, CI->second)) { @@ -1385,20 +1409,29 @@ /// the loop. bool MachineLICMBase::MayCSE(MachineInstr *MI) { unsigned Opcode = MI->getOpcode(); - DenseMap>::iterator CI = - CSEMap.find(Opcode); - // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate - // the undef property onto uses. - if (CI == CSEMap.end() || MI->isImplicitDef()) - return false; + for (auto &Map : CSEMap) { + // Check this CSEMap's preheader dominates MI's basic block. + if (DT->dominates(Map.first, MI->getParent())) { + DenseMap>::iterator CI = + Map.second.find(Opcode); + // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate + // the undef property onto uses. + if (CI == Map.second.end() || MI->isImplicitDef()) + continue; + if (LookForDuplicate(MI, CI->second) != nullptr) + return true; + } + } - return LookForDuplicate(MI, CI->second) != nullptr; + return false; } /// When an instruction is found to use only loop invariant operands /// that are safe to hoist, this instruction is called to do the dirty work. /// It returns true if the instruction is hoisted. -bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { +HoistResult MachineLICMBase::Hoist(MachineInstr *MI, + MachineBasicBlock *Preheader, + MachineLoop *CurLoop) { MachineBasicBlock *SrcBlock = MI->getParent(); // Disable the instruction hoisting due to block hotness @@ -1406,13 +1439,15 @@ (DisableHoistingToHotterBlocks == UseBFI::PGO && HasProfileData)) && isTgtHotterThanSrc(SrcBlock, Preheader)) { ++NumNotHoistedDueToHotness; - return false; + return HoistResult::NotHoisted; } // First check whether we should hoist this instruction. - if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { + if (!IsLoopInvariantInst(*MI, CurLoop) || + !IsProfitableToHoist(*MI, CurLoop)) { // If not, try unfolding a hoistable load. - MI = ExtractHoistableLoad(MI); - if (!MI) return false; + MI = ExtractHoistableLoad(MI, CurLoop); + if (!MI) + return HoistResult::NotHoisted; } // If we have hoisted an instruction that may store, it can only be a constant @@ -1440,9 +1475,22 @@ // Look for opportunity to CSE the hoisted instruction. unsigned Opcode = MI->getOpcode(); - DenseMap>::iterator CI = - CSEMap.find(Opcode); - if (!EliminateCSE(MI, CI)) { + bool HasCSEDone = false; + for (auto &Map : CSEMap) { + // Check this CSEMap's preheader dominates MI's basic block. + if (DT->dominates(Map.first, MI->getParent())) { + DenseMap>::iterator CI = + Map.second.find(Opcode); + if (CI != Map.second.end()) { + if (EliminateCSE(MI, CI)) { + HasCSEDone = true; + break; + } + } + } + } + + if (!HasCSEDone) { // Otherwise, splice the instruction to the preheader. Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); @@ -1462,21 +1510,21 @@ if (!MO.isDead()) MRI->clearKillFlags(MO.getReg()); - // Add to the CSE map. - if (CI != CSEMap.end()) - CI->second.push_back(MI); - else - CSEMap[Opcode].push_back(MI); + CSEMap[Preheader][Opcode].push_back(MI); } ++NumHoisted; Changed = true; - return true; + if (HasCSEDone) + return HoistResult::CSEd; + return HoistResult::Hoisted; } /// Get the preheader for the current loop, splitting a critical edge if needed. -MachineBasicBlock *MachineLICMBase::getCurPreheader() { +MachineBasicBlock * +MachineLICMBase::getCurPreheader(MachineLoop *CurLoop, + MachineBasicBlock *CurPreheader) { // Determine the block to which to hoist instructions. If we can't find a // suitable loop predecessor, we can't do any hoisting. Index: llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll =================================================================== --- llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll +++ llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll @@ -35,18 +35,18 @@ ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_5: // %vector.ph ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: dup v0.8h, w15 ; CHECK-NEXT: mov x16, x14 ; CHECK-NEXT: mov x17, x13 ; CHECK-NEXT: mov x18, x12 ; CHECK-NEXT: .LBB0_6: // %vector.body ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: dup v0.8h, w15 ; CHECK-NEXT: ldp q1, q4, [x16, #-16] -; CHECK-NEXT: ldp q3, q2, [x17, #-32] ; CHECK-NEXT: subs x18, x18, #16 -; CHECK-NEXT: ldp q6, q5, [x17] +; CHECK-NEXT: ldp q3, q2, [x17, #-32] ; CHECK-NEXT: add x16, x16, #32 +; CHECK-NEXT: ldp q6, q5, [x17] ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h Index: llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -557,11 +557,11 @@ ; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 ; GFX908-NEXT: s_mul_i32 s0, s0, s5 ; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -571,15 +571,17 @@ ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v8, s8 ; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -599,9 +601,9 @@ ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s0 +; GFX908-NEXT: s_add_u32 s20, s20, s14 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s1 +; GFX908-NEXT: s_addc_u32 s21, s21, s15 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 @@ -620,7 +622,7 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -648,7 +650,7 @@ ; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15] +; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -663,7 +665,7 @@ ; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_mov_b64 s[0:1], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 @@ -672,7 +674,7 @@ ; GFX908-NEXT: s_addc_u32 s7, s7, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_mov_b64 s[0:1], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -722,11 +724,11 @@ ; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 ; GFX90A-NEXT: s_mul_i32 s0, s0, s5 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -736,12 +738,14 @@ ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -760,8 +764,8 @@ ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s0 -; GFX90A-NEXT: s_addc_u32 s21, s21, s1 +; GFX90A-NEXT: s_add_u32 s20, s20, s14 +; GFX90A-NEXT: s_addc_u32 s21, s21, s15 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -781,7 +785,7 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] ; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -802,7 +806,7 @@ ; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15] +; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -817,7 +821,7 @@ ; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_mov_b64 s[0:1], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 @@ -826,7 +830,7 @@ ; GFX90A-NEXT: s_addc_u32 s7, s7, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -22,16 +22,16 @@ ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s6, 8 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[16:17] -; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] +; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[18:19], 0 Index: llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -1,15 +1,9 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}negated_cond: -; GCN: .LBB0_1: -; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], -; GCN: .LBB0_3: -; GCN-NOT: v_cndmask_b32 -; GCN-NOT: v_cmp -; GCN: s_andn2_b64 vcc, exec, [[CC]] -; GCN: s_lshl_b32 s12, s12, 5 -; GCN: s_cbranch_vccz .LBB0_6 +; GCN: .LBB0_2: +; GCN: v_cndmask_b32_e64 +; GCN: v_cmp_ne_u32_e64 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { bb: br label %bb1 Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -63,18 +63,17 @@ ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v2 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[6:7], 0x0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v2 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] @@ -82,25 +81,26 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: s_mov_b64 s[74:75], 0x80 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v4 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -166,7 +166,7 @@ ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -178,24 +178,26 @@ ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_15 ; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_14: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.16: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -207,10 +209,10 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.21: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -236,7 +238,7 @@ ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -264,7 +266,7 @@ ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -350,18 +352,17 @@ ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v2 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[6:7], 0x0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v2 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] @@ -369,25 +370,26 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: s_mov_b64 s[76:77], 0x80 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v4 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -453,7 +455,7 @@ ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -465,24 +467,26 @@ ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_14: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.16: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -494,10 +498,10 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.21: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -523,7 +527,7 @@ ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -551,7 +555,7 @@ ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -447,67 +447,69 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 { ; CHECK-LABEL: arm_mat_mult_q31: ; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrd r9, r12, [sp, #120] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: ldrd r9, r12, [sp, #128] ; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w r7, r6, r7, lsr #1 -; CHECK-NEXT: vdup.32 q1, r9 ; CHECK-NEXT: bic r7, r7, #3 -; CHECK-NEXT: vshl.i32 q3, q1, #3 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r10, r6, r7, lsr #2 -; CHECK-NEXT: adr r7, .LCPI9_0 ; CHECK-NEXT: adr r6, .LCPI9_1 -; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r7, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vdup.32 q0, r9 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vshl.i32 q3, q0, #3 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB9_2 Depth 2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: mul r11, r8, r9 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: mul r7, r8, r12 +; CHECK-NEXT: mul lr, r8, r12 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mul r6, r8, r9 +; CHECK-NEXT: vdup.32 q4, lr +; CHECK-NEXT: vshl.i32 q4, q4, #2 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q0 ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: vdup.32 q5, r7 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vshl.i32 q5, q5, #2 -; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov q7, q2 ; CHECK-NEXT: dls lr, r10 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vadd.i32 q5, q5, q0 -; CHECK-NEXT: vmlas.i32 q6, q2, r5 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmlas.i32 q7, q0, r7 +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q7, q6, q3 -; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] -; CHECK-NEXT: vldrw.u32 q6, [q5, #32]! -; CHECK-NEXT: vmul.i32 q0, q0, q6 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vadd.i32 q4, q0, q4 +; CHECK-NEXT: vadd.i32 q0, q7, q3 +; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] +; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! +; CHECK-NEXT: vmul.i32 q1, q1, q7 +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vadd.i32 q5, q1, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 -; CHECK-NEXT: add.w r4, r5, r11 -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: vaddv.u32 r6, q4 -; CHECK-NEXT: cmp r5, r9 -; CHECK-NEXT: str.w r6, [r2, r4, lsl #2] +; CHECK-NEXT: adds r5, r7, r6 +; CHECK-NEXT: adds r7, #1 +; CHECK-NEXT: vaddv.u32 r4, q5 +; CHECK-NEXT: cmp r7, r9 +; CHECK-NEXT: str.w r4, [r2, r5, lsl #2] ; CHECK-NEXT: bne .LBB9_2 ; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1 @@ -515,10 +517,9 @@ ; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.6: @ %for.end25 -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.7: ; CHECK-NEXT: .LCPI9_0: @@ -859,18 +860,18 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrd r2, r7, [sp, #104] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: ldrd r2, r7, [sp, #136] ; CHECK-NEXT: add.w r8, r7, #10 ; CHECK-NEXT: adr r7, .LCPI11_0 -; CHECK-NEXT: ldr r1, [sp, #96] +; CHECK-NEXT: ldr r1, [sp, #128] ; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov.w r9, #6 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r10, #6 ; CHECK-NEXT: movs r6, #11 ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: movs r5, #0 @@ -880,7 +881,7 @@ ; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 @@ -888,46 +889,49 @@ ; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vdup.32 q2, r9 +; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB11_3: @ %for.body27.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ => This Loop Header: Depth=3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: dls lr, r9 +; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: mov.w r11, #4 +; CHECK-NEXT: vdup.32 q3, r7 ; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ => This Loop Header: Depth=4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mul r4, r11, r6 -; CHECK-NEXT: vdup.32 q3, r5 -; CHECK-NEXT: vdup.32 q2, r7 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q4, r2 -; CHECK-NEXT: adds r4, #113 -; CHECK-NEXT: vadd.i32 q4, q1, r4 -; CHECK-NEXT: mov r4, r8 -; CHECK-NEXT: vmla.i32 q2, q4, r2 +; CHECK-NEXT: mul r5, r11, r6 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vadd.i32 q5, q1, r5 +; CHECK-NEXT: vmla.i32 q4, q5, r2 +; CHECK-NEXT: vldrw.u32 q5, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: adds r5, #113 +; CHECK-NEXT: vadd.i32 q6, q1, r5 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: vmla.i32 q5, q6, r2 ; CHECK-NEXT: .LBB11_5: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q6, [r0, q2] -; CHECK-NEXT: vadd.i32 q5, q2, q0 -; CHECK-NEXT: vadd.i32 q4, q3, q0 -; CHECK-NEXT: subs r4, #4 -; CHECK-NEXT: vadd.i32 q2, q6, r2 -; CHECK-NEXT: vldrb.s32 q6, [r1, q3] -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmlava.u32 r12, q2, q6 -; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vldrb.s32 q2, [r0, q5] +; CHECK-NEXT: vadd.i32 q7, q5, q0 +; CHECK-NEXT: vldrb.s32 q5, [r1, q4] +; CHECK-NEXT: vadd.i32 q6, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vmlava.u32 r12, q2, q5 +; CHECK-NEXT: vmov q5, q7 +; CHECK-NEXT: vmov q4, q6 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 @@ -935,19 +939,19 @@ ; CHECK-NEXT: le lr, .LBB11_4 ; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i ; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3 -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: add.w r10, r10, #1 -; CHECK-NEXT: cmp r5, r2 +; CHECK-NEXT: adds r7, #1 +; CHECK-NEXT: adds r4, #1 +; CHECK-NEXT: cmp r7, r2 ; CHECK-NEXT: bne .LBB11_3 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2 -; CHECK-NEXT: adds r7, #1 -; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: cmp r9, r3 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i ; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1 ; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r7, [sp, #148] +; CHECK-NEXT: ldr r7, [sp, #180] ; CHECK-NEXT: adds r5, #1 ; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it eq Index: llvm/test/CodeGen/WebAssembly/reg-stackify.ll =================================================================== --- llvm/test/CodeGen/WebAssembly/reg-stackify.ll +++ llvm/test/CodeGen/WebAssembly/reg-stackify.ll @@ -471,8 +471,7 @@ ; CHECK-LABEL: multiple_defs: ; CHECK: f64.add $push[[NUM0:[0-9]+]]=, ${{[0-9]+}}, $pop{{[0-9]+}}{{$}} ; CHECK-NEXT: local.tee $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}} -; CHECK-NEXT: f64.select $push{{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}} -; CHECK: $[[NUM2]]=, +; CHECK-NEXT: f64.select ${{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}} ; NOREGS-LABEL: multiple_defs: ; NOREGS: f64.add ; NOREGS: local.tee