Index: llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
===================================================================
--- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -202,6 +202,11 @@
   void getGlobalUses(MachineInstr *MI, int PhysReg,
                      InstSet &Uses) const;
 
+  /// Collect all possible definitions of the value stored in PhysReg, which is
+  /// used by MI.
+  void getGlobalReachingDefs(MachineInstr *MI, int PhysReg,
+                             InstSet &Defs) const;
+
   /// Return whether From can be moved forwards to just before To.
   bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const;
 
Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp
===================================================================
--- llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -389,6 +389,18 @@
   }
 }
 
+void
+ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, int PhysReg,
+                                           InstSet &Defs) const {
+  if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) {
+    Defs.insert(Def);
+    return;
+  }
+
+  for (auto *MBB : MI->getParent()->predecessors())
+    getLiveOuts(MBB, PhysReg, Defs);
+}
+
 void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg,
                                       InstSet &Defs) const {
  SmallPtrSet<MachineBasicBlock *, 2> VisitedBBs;
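The new query is deliberately thin: a unique reaching definition wins, otherwise the live-outs of every predecessor block are collected. A minimal usage sketch (the helper name here is hypothetical, not part of the patch; it assumes a pass that already holds a ReachingDefAnalysis reference):

  // Sketch: true if the value of PhysReg consumed by MI has exactly one
  // possible defining instruction, found either as a unique reaching def
  // or as the sole live-out gathered from the predecessor blocks.
  static bool hasSingleGlobalDef(ReachingDefAnalysis &RDA, MachineInstr *MI,
                                 int PhysReg) {
    SmallPtrSet<MachineInstr *, 2> Defs;
    RDA.getGlobalReachingDefs(MI, PhysReg, Defs);
    return Defs.size() == 1;
  }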
Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -201,17 +201,6 @@
     PredicatedMI *getDivergent() const { return Divergent; }
   };
 
-  struct Reduction {
-    MachineInstr *Init;
-    MachineInstr &Copy;
-    MachineInstr &Reduce;
-    MachineInstr &VPSEL;
-
-    Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add,
-              MachineInstr *Sel)
-      : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { }
-  };
-
   struct LowOverheadLoop {
 
     MachineLoop &ML;
@@ -232,7 +221,6 @@
     SetVector<MachineInstr *> CurrentPredicate;
     SmallVector<VPTBlock, 4> VPTBlocks;
     SmallPtrSet<MachineInstr *, 4> ToRemove;
-    SmallVector<std::unique_ptr<Reduction>, 1> Reductions;
     SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
     bool Revert = false;
     bool CannotTailPredicate = false;
@@ -270,10 +258,6 @@
     // of elements to the loop start instruction.
     bool ValidateTailPredicate(MachineInstr *StartInsertPt);
 
-    // See whether the live-out instructions are a reduction that we can fixup
-    // later.
-    bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers);
-
     // Check that any values available outside of the loop will be the same
     // after tail predication conversion.
     bool ValidateLiveOuts();
@@ -365,8 +349,6 @@
 
     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
 
-    void FixupReductions(LowOverheadLoop &LoLoop) const;
-
     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
 
     void Expand(LowOverheadLoop &LoLoop);
@@ -447,8 +429,10 @@
     }
   }
 
-  if (!ValidateLiveOuts())
+  if (!ValidateLiveOuts()) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
     return false;
+  }
 
   // For tail predication, we need to provide the number of elements, instead
   // of the iteration count, to the loop start instruction. The number of
@@ -636,7 +620,6 @@
     return false;
   }
 
-
   // Look at its register uses to see if it can only receive zeros
   // into its false lanes which would then produce zeros. Also check that
   // the output register is also defined by a FalseLanesZero instruction
@@ -649,120 +632,40 @@
   if (canGenerateNonZeros(MI))
     return false;
 
+  bool isPredicated = isVectorPredicated(&MI);
+  // Predicated loads will write zeros to the falsely predicated bytes of the
+  // destination register.
+  if (isPredicated && MI.mayLoad())
+    return true;
+
+  auto IsZeroInit = [](MachineInstr *Def) {
+    return !isVectorPredicated(Def) &&
+           Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
+           Def->getOperand(1).getImm() == 0;
+  };
+
   bool AllowScalars = isHorizontalReduction(MI);
   for (auto &MO : MI.operands()) {
     if (!MO.isReg() || !MO.getReg())
       continue;
     if (!isRegInClass(MO, QPRs) && AllowScalars)
       continue;
-    if (auto *OpDef = RDA.getMIOperand(&MI, MO))
-      if (FalseLanesZero.count(OpDef))
-        continue;
-    return false;
-  }
-  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
-  return true;
-}
-
-bool
-LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) {
-  // Also check for reductions where the operation needs to be merging values
-  // from the last and previous loop iterations. This means an instruction
-  // producing a value and a vmov storing the value calculated in the previous
-  // iteration. So we can have two live-out regs, one produced by a vmov and
-  // both being consumed by a vpsel.
-  LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n";
-             for (auto *MI : LiveMIs)
-               dbgs() << " - " << *MI);
-
-  if (!Preheader)
-    return false;
-
-  // Expect a vmov, a vadd and a single vpsel user.
-  // TODO: This means we can't currently support multiple reductions in the
-  // loop.
-  if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1)
-    return false;
-
-  MachineInstr *VPSEL = *LiveOutUsers.begin();
-  if (VPSEL->getOpcode() != ARM::MVE_VPSEL)
-    return false;
-
-  unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1;
-  MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx);
-  if (!Pred || Pred != VCTP) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n");
-    return false;
-  }
-
-  MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1);
-  if (!Reduce)
-    return false;
-
-  assert(LiveMIs.count(Reduce) && "Expected MI to be live-out");
-
-  // TODO: Support more operations than VADD.
-  switch (VCTP->getOpcode()) {
-  default:
-    return false;
-  case ARM::MVE_VCTP8:
-    if (Reduce->getOpcode() != ARM::MVE_VADDi8)
-      return false;
-    break;
-  case ARM::MVE_VCTP16:
-    if (Reduce->getOpcode() != ARM::MVE_VADDi16)
-      return false;
-    break;
-  case ARM::MVE_VCTP32:
-    if (Reduce->getOpcode() != ARM::MVE_VADDi32)
+    // Check that this instruction will produce zeros in its false lanes:
+    // - If it only consumes false lanes zero or constant 0 (vmov #0)
+    // - If it's predicated, it only matters that its def register already has
+    //   false lane zeros, so we can ignore its uses.
+    SmallPtrSet<MachineInstr *, 2> Defs;
+    RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
+    for (auto *Def : Defs) {
+      if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
+        continue;
+      if (MO.isUse() && isPredicated)
+        continue;
       return false;
-    break;
-  }
-
-  // Test that the reduce op is overwriting one of its operands.
-  if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() &&
-      Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n");
-    return false;
-  }
-
-  // Check that the VORR is actually a VMOV.
-  MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2);
-  if (!Copy || Copy->getOpcode() != ARM::MVE_VORR ||
-      !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() ||
-      Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg())
-    return false;
-
-  assert(LiveMIs.count(Copy) && "Expected MI to be live-out");
-
-  // Check that the vadd and vmov are only used by each other and the vpsel.
-  SmallPtrSet<MachineInstr *, 2> CopyUsers;
-  RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers);
-  if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n");
-    return false;
-  }
-
-  SmallPtrSet<MachineInstr *, 2> ReduceUsers;
-  RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers);
-  if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n");
-    return false;
+    }
   }
-
-  // Then find whether there's an instruction initialising the register that
-  // is storing the reduction.
-  SmallPtrSet<MachineInstr *, 1> Incoming;
-  RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming);
-  if (Incoming.size() > 1)
-    return false;
-
-  MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin();
-  LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n"
-                    << " - " << *Copy
-                    << " - " << *Reduce
-                    << " - " << *VPSEL);
-  Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL));
+  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
   return true;
 }
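Restated outside the diff, the rule the hunk above implements looks roughly like this (a sketch, not the patch itself: isZeroInit stands in for the IsZeroInit lambda, and Known for the FalseLanesZero set):

  static bool isZeroInit(MachineInstr *Def); // the IsZeroInit lambda above

  // An instruction produces zeros in its false lanes when every reaching def
  // of every register operand is harmless: the instruction itself (tied-def),
  // something already proven false-lanes-zero, or a 'vmov.i32 qN, #0'.
  // Predicated uses may be skipped because the VCTP mask clears them anyway.
  static bool falseLanesAreZero(MachineInstr &MI, ReachingDefAnalysis &RDA,
                                const SmallPtrSetImpl<MachineInstr *> &Known,
                                bool IsPredicated) {
    for (MachineOperand &MO : MI.operands()) {
      if (!MO.isReg() || !MO.getReg())
        continue;
      SmallPtrSet<MachineInstr *, 2> Defs;
      RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
      for (MachineInstr *Def : Defs) {
        if (Def == &MI || Known.count(Def) || isZeroInit(Def))
          continue;
        if (MO.isUse() && IsPredicated)
          continue;
        return false; // A def with unknown false lanes may leak junk through.
      }
    }
    return true;
  }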
@@ -803,28 +706,20 @@
     if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
       continue;
 
-    // Predicated loads will write zeros to the falsely predicated bytes of the
-    // destination register.
-    if (isVectorPredicated(&MI)) {
-      if (MI.mayLoad())
-        FalseLanesZero.insert(&MI);
-      Predicated.insert(&MI);
-      continue;
-    }
+    bool isPredicated = isVectorPredicated(&MI);
+    bool retainsOrReduces =
+      retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
 
-    if (MI.getNumDefs() == 0)
+    if (isPredicated)
+      Predicated.insert(&MI);
+    if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
+      FalseLanesZero.insert(&MI);
+    else if (MI.getNumDefs() == 0)
       continue;
-
-    if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) {
-      // We require retaining and horizontal operations to operate upon zero'd
-      // false lanes to ensure the conversion doesn't change the output.
-      if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI))
-        return false;
-      // Otherwise we need to evaluate this instruction later to see whether
-      // unknown false lanes will get masked away by their user(s).
+    else if (!isPredicated && retainsOrReduces)
+      return false;
+    else
       FalseLanesUnknown.insert(&MI);
-    } else if (!isHorizontalReduction(MI))
-      FalseLanesZero.insert(&MI);
   }
 
   auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
@@ -853,48 +748,44 @@
         LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
                    << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
         NonPredicated.insert(MI);
-        continue;
+        break;
       }
     }
 
     // Any unknown false lanes have been masked away by the user(s).
-    Predicated.insert(MI);
+    if (!NonPredicated.contains(MI))
+      Predicated.insert(MI);
   }
 
   SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
-  SmallPtrSet<MachineInstr *, 2> LiveOutUsers;
   SmallVector<MachineBasicBlock *, 2> ExitBlocks;
   ML.getExitBlocks(ExitBlocks);
   assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
   assert(ExitBlocks.size() == 1 && "Expected a single exit block");
   MachineBasicBlock *ExitBB = ExitBlocks.front();
   for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
+    // TODO: Instead of blocking predication, we could move the vctp to the
+    // exit block and calculate its operand there or in the preheader.
+    if (RegMask.PhysReg == ARM::VPR)
+      return false;
     // Check Q-regs that are live in the exit blocks. We don't collect scalars
     // because they won't be affected by lane predication.
-    if (QPRs->contains(RegMask.PhysReg)) {
+    if (QPRs->contains(RegMask.PhysReg))
       if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
         LiveOutMIs.insert(MI);
-      RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers);
-    }
   }
 
-  // If we have any non-predicated live-outs, they need to be part of a
-  // reduction that we can fixup later. The reduction takes the form of an
-  // operation that uses its previous values through a vmov and then a vpsel
-  // that resides in the exit blocks to select the final bytes from the n and
-  // n-1 iterations.
-  if (!NonPredicated.empty() &&
-      !FindValidReduction(NonPredicated, LiveOutUsers))
-    return false;
-
   // We've already validated that any VPT predication within the loop will be
   // equivalent when we perform the predication transformation; so we know that
   // any VPT predicated instruction is predicated upon VCTP. Any live-out
-  // instruction needs to be predicated, so check this here. The instructions
-  // in NonPredicated have been found to be a reduction that we can ensure its
-  // legality.
-  for (auto *MI : LiveOutMIs)
-    if (!isVectorPredicated(MI) && !NonPredicated.count(MI))
+  // instruction needs to be predicated, so check this here. A live-out that
+  // is in NonPredicated, with false lanes that are still unknown, cannot be
+  // proven legal, so reject the loop.
+  for (auto *MI : LiveOutMIs) {
+    if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
+      LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI);
      return false;
+    }
+  }
 
   return true;
 }
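For a concrete picture of what the reworked ValidateLiveOuts now accepts, consider this hand-written MVE loop (illustrative only, in the same style as the tests below, not generated output): the accumulator is zero-initialised and only ever combined with predicated loads, so its false lanes provably stay zero and it may be live out without a trailing vpsel.

  vmov.i32  q0, #0x0          @ zero-initialised accumulator
  dlstp.32  lr, r2
.LBB0_1:
  vldrw.u32 q1, [r0], #16     @ predicated load: false lanes are written as zero
  vadd.i32  q0, q0, q1        @ unpredicated, but both inputs are false-lanes-zero
  letp      lr, .LBB0_1
@ %bb.2:
  vaddv.u32 r0, q0            @ live-out q0 has known-zero false lanes

A live-out whose false lanes cannot be proven zero now simply fails validation instead of being rewritten after the fact; the FixupReductions machinery below is therefore dead and removed.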
@@ -1360,61 +1251,6 @@
   return &*MIB;
 }
 
-void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const {
-  LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n");
-  auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) {
-    MachineBasicBlock *MBB = InsertPt.getParent();
-    MachineInstrBuilder MIB =
-      BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR));
-    MIB.addDef(To);
-    MIB.addReg(From);
-    MIB.addReg(From);
-    MIB.addImm(0);
-    MIB.addReg(0);
-    MIB.addReg(To);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB);
-  };
-
-  for (auto &Reduction : LoLoop.Reductions) {
-    MachineInstr &Copy = Reduction->Copy;
-    MachineInstr &Reduce = Reduction->Reduce;
-    Register DestReg = Copy.getOperand(0).getReg();
-
-    // Change the initialiser if present
-    if (Reduction->Init) {
-      MachineInstr *Init = Reduction->Init;
-
-      for (unsigned i = 0; i < Init->getNumOperands(); ++i) {
-        MachineOperand &MO = Init->getOperand(i);
-        if (MO.isReg() && MO.isUse() && MO.isTied() &&
-            Init->findTiedOperandIdx(i) == 0)
-          Init->getOperand(i).setReg(DestReg);
-      }
-      Init->getOperand(0).setReg(DestReg);
-      LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init);
-    } else
-      BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg());
-
-    // Change the reducing op to write to the register that is used to copy
-    // its value on the next iteration. Also update the tied-def operand.
-    Reduce.getOperand(0).setReg(DestReg);
-    Reduce.getOperand(5).setReg(DestReg);
-    LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce);
-
-    // Instead of a vpsel, just copy the register into the necessary one.
-    MachineInstr &VPSEL = Reduction->VPSEL;
-    if (VPSEL.getOperand(0).getReg() != DestReg)
-      BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg);
-
-    // Remove the unnecessary instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" - << " - " << Copy - << " - " << VPSEL << "\n"); - Copy.eraseFromParent(); - VPSEL.eraseFromParent(); - } -} - void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -1568,10 +1404,8 @@ RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) { + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); - FixupReductions(LoLoop); - } for (auto *I : LoLoop.ToRemove) { LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); I->eraseFromParent(); Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -135,34 +135,27 @@ ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14 /* CC::al */, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: $vpr = VMSR_P0 $r5, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3, $r12 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr + ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed 
renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -115,27 +115,16 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.middle.block: ; CHECK: liveins: $q0 ; CHECK: $r0 = VMOVRS killed $s3, 14 /* CC::al */, $noreg, implicit killed $q0 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -2,8 +2,22 @@ ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* 
nocapture %pResult) { -; CHECK-LABEL: .LBB0_1: @ %do.body.i -; CHECK: dlstp.32 lr, r1 +; CHECK-LABEL: arm_var_f32_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: .LBB0_1: @ %do.body.i +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit +; CHECK-NEXT: vmov s4, r1 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 @@ -18,6 +32,14 @@ ; CHECK-NEXT: vsub.f32 q2, q2, q1 ; CHECK-NEXT: vfma.f32 q0, q2, q2 ; CHECK-NEXT: letp lr, .LBB0_3 +; CHECK-NEXT: @ %bb.4: @ %do.end +; CHECK-NEXT: subs r0, r1, #1 +; CHECK-NEXT: vadd.f32 s0, s3, s3 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcvt.f32.u32 s2, s2 +; CHECK-NEXT: vdiv.f32 s0, s0, s2 +; CHECK-NEXT: vstr s0, [r2] +; CHECK-NEXT: pop {r4, pc} entry: br label %do.body.i Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -112,27 +112,16 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, 
$noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.middle.block: ; CHECK: liveins: $q0 ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -4,37 +4,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr { ; CHECK-LABEL: one_loop_add_add_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: ittt eq -; CHECK-NEXT: moveq r0, #0 -; CHECK-NEXT: uxtbeq r0, r0 -; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #15 -; CHECK-NEXT: sub.w r12, r3, #16 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #4 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: cbz r2, .LBB0_4 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.8 lr, r2 +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 -; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vadd.i8 q1, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 -; CHECK-NEXT: vadd.i8 q1, q1, q2 -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vaddv.u8 r0, q0 -; CHECK-NEXT: pop.w {r7, lr} -; CHECK-NEXT: uxtb r0, r0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vadd.i8 q0, q2, q1 +; CHECK-NEXT: vaddv.u8 r12, q0 +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} entry: %cmp11 = icmp eq i32 %N, 0 br i1 %cmp11, label %for.cond.cleanup, label %vector.ph @@ -55,19 +43,18 @@ %i2 = getelementptr inbounds i8, i8* %b, i32 %index %i3 = bitcast i8* %i2 to <16 x i8>* %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) - %i4 = add <16 x i8> %wide.masked.load, %vec.phi - %i5 = add <16 x i8> %i4, %wide.masked.load16 + %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16 + %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi + %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5) %index.next = add i32 %index, 16 - %i6 = icmp eq i32 %index.next, %n.vec - br i1 %i6, label %middle.block, label %vector.body + %i7 = icmp eq i32 %index.next, %n.vec + br i1 %i7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry - %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ] + %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ] ret i8 %res.0.lcssa } @@ -152,16 
+139,26 @@ ; CHECK-NEXT: uxtbeq r0, r0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vsub.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -211,16 +208,26 @@ ; CHECK-NEXT: sxtheq r0, r0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vsub.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB3_1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -272,16 +279,26 @@ ; CHECK-NEXT: uxtbeq r0, r0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r0], #16 -; CHECK-NEXT: vldrb.u8 q2, [r1], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vmul.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB4_1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -331,16 +348,26 @@ ; CHECK-NEXT: sxtheq r0, r0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB5_1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -416,19 +443,25 @@ ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vmov.32 q0[0], r12 ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u32 q0, [r0], #4 -; CHECK-NEXT: vldrb.u32 q2, [r1], #4 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u32 q0, [r0], #4 +; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB6_5 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block44 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7 ; CHECK-NEXT: mov r0, r12 @@ -636,22 +669,31 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: adds r1, r3, #3 +; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: movw r1, :lower16:days ; CHECK-NEXT: movt r1, :upper16:days ; CHECK-NEXT: movs r2, #52 ; CHECK-NEXT: mla r1, r4, r2, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: subs r0, r3, #1 -; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB8_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB8_5 +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB8_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r4, pc} entry: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -50,25 +50,14 @@ ; CHECK-LABEL: bad: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: cmp r2, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: subs r3, r2, r3 -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; 
CHECK-NEXT: .LBB1_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmlava.s32 r12, q0, q1 -; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -53,24 +53,33 @@ ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: subs r4, r2, r6 -; ENABLED-NEXT: vmov.i32 q0, #0x0 +; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 +; ENABLED-NEXT: sub.w r0, r12, r6 +; ENABLED-NEXT: bic r0, r0, #3 +; ENABLED-NEXT: subs r0, #4 +; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dlstp.32 lr, r4 +; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vldrh.s32 q1, [r0], #8 -; ENABLED-NEXT: vldrh.s32 q2, [r7], #8 +; ENABLED-NEXT: vctp.32 r4 +; ENABLED-NEXT: vmov q0, q1 +; ENABLED-NEXT: vpstt +; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 +; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 ; ENABLED-NEXT: mov lr, r8 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 ; ENABLED-NEXT: sub.w r8, r8, #1 ; ENABLED-NEXT: vshl.s32 q1, r5 -; ENABLED-NEXT: vadd.i32 q0, q1, q0 -; ENABLED-NEXT: letp lr, .LBB0_6 +; ENABLED-NEXT: subs r4, #4 +; ENABLED-NEXT: vadd.i32 q1, q1, q0 +; ENABLED-NEXT: le lr, .LBB0_6 ; ENABLED-NEXT: @ %bb.7: @ %middle.block ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 @@ -115,24 +124,33 @@ ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: subs r4, r2, r6 -; NOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 +; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: sub.w r0, r12, r6 +; NOREDUCTIONS-NEXT: bic r0, r0, #3 +; NOREDUCTIONS-NEXT: subs r0, #4 +; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dlstp.32 lr, r4 +; NOREDUCTIONS-NEXT: dls lr, r0 ; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 -; NOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 +; NOREDUCTIONS-NEXT: vctp.32 r4 +; NOREDUCTIONS-NEXT: vmov q0, q1 +; NOREDUCTIONS-NEXT: vpstt +; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 +; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 ; NOREDUCTIONS-NEXT: mov lr, r8 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 ; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 -; NOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 -; NOREDUCTIONS-NEXT: letp lr, .LBB0_6 +; NOREDUCTIONS-NEXT: subs r4, #4 +; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 +; NOREDUCTIONS-NEXT: le lr, .LBB0_6 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block ; NOREDUCTIONS-NEXT: @ in Loop: 
Header=BB0_4 Depth=1 +; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -119,28 +119,18 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -9,19 +9,28 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r3, r2, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: 
add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -76,17 +85,26 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -137,17 +155,26 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -425,13 +425,8 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg 
- ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2WLS killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 ; CHECK: tB %bb.4, 14 /* CC::al */, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -441,18 +436,15 @@ ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $vpr = MVE_VCTP32 $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, killed renamable $vpr, undef renamable $q1 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: liveins: $q0, $q1, $r3 @@ -501,7 +493,7 @@ renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg MVE_VPST 8, implicit $vpr - renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, renamable $vpr, undef renamable $q1 + renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 1, renamable $vpr, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -130,23 +130,21 @@ ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r12 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = MVE_DLSTP_32 renamable $r3 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable 
$r3, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.middle.block: ; CHECK: liveins: $q0, $q1, $r2, $r3 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14 /* CC::al */, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -116,28 +116,18 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, 
align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -9,19 +9,24 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 -; CHECK-NEXT: vldrw.u32 q3, [q0, #80]! +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q3, [q0, #80]! +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q3, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} @@ -77,20 +82,26 @@ ; CHECK-NEXT: adr r0, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2] -; CHECK-NEXT: vldrw.u32 q4, [r4], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q2, q2, q4 -; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r1, q1, uxtw #2] ; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r12, lsl #2] ; CHECK-NEXT: vpop {d8, d9} @@ -144,20 +155,26 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [q0, #80]! -; CHECK-NEXT: vadd.i32 q3, q3, q1 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q1, [q0, #80]! +; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q1, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,22 +9,32 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 +; CHECK-NEXT: cmp r1, #4 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge.w r12, #4 +; CHECK-NEXT: sub.w r6, r1, r12 +; CHECK-NEXT: adds r6, #3 +; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: add.w lr, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vcmp.f32 ge, q1, q4 -; CHECK-NEXT: vpstt +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstttt +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vcmpt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1