Index: llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
===================================================================
--- llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -152,10 +152,9 @@
   void getReachingLocalUses(MachineInstr *MI, int PhysReg, InstSet &Uses) const;
 
-  /// Search MBB for a definition of PhysReg and insert it into Defs. If no
-  /// definition is found, recursively search the predecessor blocks for them.
-  void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs,
-                   BlockSet &VisitedBBs) const;
+  /// Search MBB for a definition of PhysReg and insert it into Defs. If no
+  /// definition is found, recursively search the predecessor blocks for them.
+  void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs) const;
 
   /// For the given block, collect the instructions that use the live-in
   /// value of the provided register. Return whether the value is still
@@ -228,6 +227,11 @@
   /// Provides the instruction of the closest reaching def instruction of
   /// PhysReg that reaches MI, relative to the begining of MI's basic block.
   MachineInstr *getReachingLocalMIDef(MachineInstr *MI, int PhysReg) const;
+
+  /// Search MBB for a definition of PhysReg and insert it into Defs. If no
+  /// definition is found, recursively search the predecessor blocks for them.
+  void getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs,
+                   BlockSet &VisitedBBs) const;
 };
 
 } // namespace llvm
Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp
===================================================================
--- llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -326,6 +326,13 @@
   }
 }
 
+void
+ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg,
+                                 InstSet &Defs) const {
+  SmallPtrSet<MachineBasicBlock *, 2> VisitedBBs;
+  return getLiveOuts(MBB, PhysReg, Defs, VisitedBBs);
+}
+
 void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg,
                                       InstSet &Defs, BlockSet &VisitedBBs) const {
Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -676,6 +676,10 @@
   return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr;
 }
 
+static inline bool isLSRImmOpcode(int Opc) {
+  return Opc == ARM::LSRi || Opc == ARM::tLSRri || Opc == ARM::t2LSRri;
+}
+
 /// isValidCoprocessorNumber - decide whether an explicit coprocessor
 /// number is legal in generic instructions like CDP. The answer can
 /// vary with the subtarget.
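The new two-argument `getLiveOuts` overload lets a caller collect live-out definitions without threading a visited set through the recursion. A minimal usage sketch under that assumption (the helper name and the choice of register are illustrative, not part of the patch):

```cpp
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
using namespace llvm;

// Find the single definition of PhysReg that can be live out of MBB, or
// nullptr if there are zero or several. The convenience overload hides the
// visited-set bookkeeping that the recursive overload needs when it walks
// the predecessor blocks.
static MachineInstr *getSingleLiveOutDef(ReachingDefAnalysis &RDA,
                                         MachineBasicBlock *MBB, int PhysReg) {
  SmallPtrSet<MachineInstr *, 2> Defs;
  RDA.getLiveOuts(MBB, PhysReg, Defs);
  return Defs.size() == 1 ? *Defs.begin() : nullptr;
}
```

This mirrors how `FindValidReduction` below queries the preheader for the instruction that initialises the reduction register.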
Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -61,6 +61,8 @@
 namespace {
 
+  using InstSet = SmallPtrSetImpl<MachineInstr *>;
+
   class PostOrderLoopTraversal {
     MachineLoop &ML;
     MachineLoopInfo &MLI;
@@ -173,13 +175,27 @@
     PredicatedMI *getDivergent() const { return Divergent; }
   };
 
+  struct Reduction {
+    MachineInstr *Init;
+    MachineInstr &Copy;
+    MachineInstr &Reduce;
+    MachineInstr &VPSEL;
+    MachineInstr &VCTP;
+
+    Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add,
+              MachineInstr *Sel, MachineInstr *Pred)
+      : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel), VCTP(*Pred) { }
+  };
+
   struct LowOverheadLoop {
     MachineLoop &ML;
     MachineLoopInfo &MLI;
     ReachingDefAnalysis &RDA;
     const TargetRegisterInfo &TRI;
+    const ARMBaseInstrInfo &TII;
     MachineFunction *MF = nullptr;
+    MachineBasicBlock *Preheader = nullptr;
     MachineInstr *InsertPt = nullptr;
     MachineInstr *Start = nullptr;
     MachineInstr *Dec = nullptr;
@@ -189,13 +205,19 @@
     SetVector<MachineInstr *> CurrentPredicate;
     SmallVector<VPTBlock, 4> VPTBlocks;
     SmallPtrSet<MachineInstr *, 4> ToRemove;
+    SmallVector<std::unique_ptr<Reduction>, 1> Reductions;
     bool Revert = false;
     bool CannotTailPredicate = false;
 
     LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
-                    ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI)
-      : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI) {
+                    ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
+                    const ARMBaseInstrInfo &TII)
+      : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
       MF = ML.getHeader()->getParent();
+      if (auto *MBB = ML.getLoopPreheader())
+        Preheader = MBB;
+      else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
+        Preheader = MBB;
     }
 
     // If this is an MVE instruction, check that we know how to use tail
@@ -219,9 +241,17 @@
     // of elements to the loop start instruction.
     bool ValidateTailPredicate(MachineInstr *StartInsertPt);
 
+    // See whether the live-out instructions are a reduction that we can fix
+    // up later.
+    bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers);
+
     // Check that any values available outside of the loop will be the same
     // after tail predication conversion.
-    bool ValidateLiveOuts() const;
+    bool ValidateLiveOuts();
+
+    // Is the vpsel in the exit block predicated upon the element count in
+    // a way that allows it to combine values from two iterations?
+    MachineInstr *getMergePredicate(MachineInstr *VPSEL) const;
 
     // Is it safe to define LR with DLS/WLS?
     // LR can be defined if it is the operand to start, because it's the same
@@ -311,6 +341,8 @@
 
     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
 
+    void FixupReductions(LowOverheadLoop &LoLoop) const;
+
     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
 
     void Expand(LowOverheadLoop &LoLoop);
@@ -442,7 +474,7 @@
   };
 
   // First, find the block that looks like the preheader.
-  MachineBasicBlock *MBB = MLI.findLoopPreheader(&ML, true);
+  MachineBasicBlock *MBB = Preheader;
   if (!MBB) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
     return false;
   }
@@ -518,36 +550,191 @@
   return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
 }
 
-bool LowOverheadLoop::ValidateLiveOuts() const {
+MachineInstr *LowOverheadLoop::getMergePredicate(MachineInstr *VPSEL) const {
+  unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1;
+  MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx);
+  if (!Pred || Pred->getOpcode() != VCTP->getOpcode())
+    return nullptr;
+
+  MachineInstr *ExitBlockElems = RDA.getMIOperand(Pred, 1);
+  if (!ExitBlockElems)
+    return nullptr;
+
+  auto FirstMIUse = [this](MachineInstr *MI) -> MachineInstr* {
+    for (auto &MO : MI->uses()) {
+      if (!MO.isReg() || MO.getReg() == 0)
+        continue;
+      return RDA.getMIOperand(MI, MO);
+    }
+    return nullptr;
+  };
+
+  auto LastMIUse = [this](MachineInstr *MI) -> MachineInstr* {
+    for (auto &MO : reverse(MI->uses())) {
+      if (!MO.isReg() || MO.getReg() == 0)
+        continue;
+      return RDA.getMIOperand(MI, MO);
+    }
+    return nullptr;
+  };
+
+  auto FirstImmUse = [](MachineInstr *MI, int64_t Imm) {
+    for (auto &MO : MI->uses()) {
+      if (!MO.isImm())
+        continue;
+      return MO.getImm() == Imm;
+    }
+    return false;
+  };
+
+  // Check if the VCTP is using the exiting element count calculated in the
+  // preheader. The instructions will look something like this, where X is
+  // the vector factor:
+  // BackedgeCount = (SUB (BIC (ADD TotalElems, X-1), X-1), X)
+  // TripCount = (ADD (LSR BackedgeCount, log2(X)), 1)
+  // ExitBlockElems = (SUB TotalElems, (SHL (LSR BackedgeCount, log2(X)),
+  //                                        log2(X)))
+
+  MachineInstr *TripCount = RDA.getMIOperand(Start, 0);
+  if (!TripCount)
+    return nullptr;
+
+  if (auto *LSR = LastMIUse(ExitBlockElems)) {
+    if (!isLSRImmOpcode(LSR->getOpcode()))
+      return nullptr;
+    unsigned ShiftAmt = Log2_32(getTailPredVectorWidth(VCTP->getOpcode()));
+    if (FirstImmUse(LSR, ShiftAmt))
+      if (auto *BackedgeCount = FirstMIUse(LSR))
+        if (BackedgeCount == LastMIUse(TripCount))
+          return Pred;
+  }
+
+  return nullptr;
+}
+
+bool
+LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) {
+  // Also check for reductions where the operation needs to merge values from
+  // the last and the previous loop iterations. This means an instruction
+  // producing a value and a vmov holding the value calculated in the previous
+  // iteration. So we can have two live-out regs, one produced by a vmov and
+  // both being consumed by a vpsel.
+  LLVM_DEBUG(dbgs() << "ARM Loops: Found loop live-outs:\n";
+             for (auto *MI : LiveMIs)
+               dbgs() << " - " << *MI);
+
+  // Expect a vmov, a vadd and a single vpsel user.
+  if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1)
+    return false;
+
+  MachineInstr *VPSEL = *LiveOutUsers.begin();
+  if (VPSEL->getOpcode() != ARM::MVE_VPSEL)
+    return false;
+
+  MachineInstr *MergePred = getMergePredicate(VPSEL);
+  if (!MergePred) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n");
+    return false;
+  }
+
+  MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1);
+  if (!Reduce)
+    return false;
+
+  // TODO: Support more operations than VADD.
+  switch (VCTP->getOpcode()) {
+  default:
+    return false;
+  case ARM::MVE_VCTP32:
+    if (Reduce->getOpcode() != ARM::MVE_VADDi32)
+      return false;
+    break;
+  case ARM::MVE_VCTP16:
+    if (Reduce->getOpcode() != ARM::MVE_VADDi16)
+      return false;
+    break;
+  case ARM::MVE_VCTP8:
+    if (Reduce->getOpcode() != ARM::MVE_VADDi8)
+      return false;
+    break;
+  }
+
+  // Check that the VORR is actually a VMOV.
+  MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2);
+  if (!Copy || Copy->getOpcode() != ARM::MVE_VORR ||
+      !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() ||
+      Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg())
+    return false;
+
+  assert((LiveMIs.count(Reduce) && LiveMIs.count(Copy)) &&
+         "Expected live outs to be consumed by vpsel");
+
+  assert((Reduce->getOperand(0).getReg() == Reduce->getOperand(1).getReg() ||
+          Reduce->getOperand(0).getReg() == Reduce->getOperand(2).getReg()) &&
+         "Expected VADD to be overwriting one of its operands");
+
+  // Check that the vadd and vmov are only used by each other and the vpsel.
+  SmallPtrSet<MachineInstr *, 2> CopyUsers;
+  RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers);
+  if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce))
+    return false;
+
+  SmallPtrSet<MachineInstr *, 2> ReduceUsers;
+  RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers);
+  if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy))
+    return false;
+
+  // Then find whether there's an instruction initialising the register that
+  // holds the reduction.
+  if (!Preheader)
+    return false;
+
+  SmallPtrSet<MachineInstr *, 2> Incoming;
+  RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming);
+  if (Incoming.size() > 1)
+    return false;
+
+  MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin();
+  LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n"
+                    << " - " << *Copy << " - " << *Reduce
+                    << " - " << *MergePred << " - " << *VPSEL);
+  Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce,
+                                                   VPSEL, MergePred));
+  return true;
+}
+
+bool LowOverheadLoop::ValidateLiveOuts() {
   // Collect Q-regs that are live in the exit blocks. We don't collect scalars
   // because they won't be affected by lane predication.
   const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
   SmallSet<Register, 2> LiveOuts;
   SmallVector<MachineBasicBlock *, 2> ExitBlocks;
   ML.getExitBlocks(ExitBlocks);
-  for (auto *MBB : ExitBlocks)
-    for (const MachineBasicBlock::RegisterMaskPair &RegMask : MBB->liveins())
-      if (QPRs->contains(RegMask.PhysReg))
-        LiveOuts.insert(RegMask.PhysReg);
+  assert(ExitBlocks.size() == 1 && "Expected a single exit block");
+
+  SmallPtrSet<MachineInstr *, 2> LiveOutUsers;
+  MachineBasicBlock *MBB = ExitBlocks.front();
+  for (const MachineBasicBlock::RegisterMaskPair &RegMask : MBB->liveins()) {
+    Register PhysReg = RegMask.PhysReg;
+    if (QPRs->contains(PhysReg)) {
+      LiveOuts.insert(PhysReg);
+      RDA.getLiveInUses(MBB, PhysReg, LiveOutUsers);
+    }
+  }
 
   // Collect the instructions in the loop body that define the live-out values.
   SmallPtrSet<MachineInstr *, 4> LiveMIs;
   assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
-  MachineBasicBlock *MBB = ML.getHeader();
-  for (auto Reg : LiveOuts)
+  MBB = ML.getHeader();
+  for (auto Reg : LiveOuts) {
     if (auto *MI = RDA.getLocalLiveOutMIDef(MBB, Reg))
-      LiveMIs.insert(MI);
+      if (!isVectorPredicated(MI))
+        LiveMIs.insert(MI);
+  }
 
-  LLVM_DEBUG(dbgs() << "ARM Loops: Found loop live-outs:\n";
-             for (auto *MI : LiveMIs)
-               dbgs() << " - " << *MI);
-  // We've already validated that any VPT predication within the loop will be
-  // equivalent when we perform the predication transformation; so we know that
-  // any VPT predicated instruction is predicated upon VCTP. Any live-out
-  // instruction needs to be predicated, so check this here.
-  for (auto *MI : LiveMIs)
-    if (!isVectorPredicated(MI))
-      return false;
+  // If we have any non-predicated live-outs, they need to be part of a
+  // reduction that we can fix up later.
+  if (!LiveMIs.empty() && !FindValidReduction(LiveMIs, LiveOutUsers))
+    return false;
 
   // We want to find out if the tail-predicated version of this loop will
   // produce the same values as the loop in its original form. For this to
@@ -570,6 +757,7 @@
   // user(s) and not observable elsewhere.
   SetVector<MachineInstr *> Unknowns;
   SmallPtrSet<MachineInstr *, 4> Knowns;
+  Knowns.insert(LiveMIs.begin(), LiveMIs.end());
   for (auto &MI : *MBB) {
     if (isVectorPredicated(&MI)) {
       Knowns.insert(&MI);
@@ -602,7 +790,7 @@
   }
 
   auto HasKnownUsers = [this](MachineInstr *MI, const MachineOperand &MO,
-                              SmallPtrSetImpl<MachineInstr *> &Knowns) {
+                              InstSet &Knowns) {
     SmallPtrSet<MachineInstr *, 2> Uses;
     RDA.getGlobalUses(MI, MO.getReg(), Uses);
     for (auto *Use : Uses) {
@@ -757,6 +945,7 @@
   return !IsUse && MI->mayLoadOrStore() ? false : true;
 }
 
+
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
   const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(mf.getSubtarget());
   if (!ST.hasLOB())
@@ -816,14 +1005,12 @@
     return nullptr;
   };
 
-  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI);
+  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
 
   // Search the preheader for the start intrinsic.
   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
   // with potentially multiple set.loop.iterations, so we need to enable this.
- if (auto *Preheader = ML->getLoopPreheader()) - LoLoop.Start = SearchForStart(Preheader); - else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) - LoLoop.Start = SearchForStart(Preheader); + if (LoLoop.Preheader) + LoLoop.Start = SearchForStart(LoLoop.Preheader); else return false; @@ -1077,6 +1264,63 @@ return &*MIB; } +void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n"); + auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) { + MachineBasicBlock *MBB = InsertPt.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR)); + MIB.addDef(To); + MIB.addReg(From); + MIB.addReg(From); + MIB.addImm(0); + MIB.addReg(0); + MIB.addReg(To); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB); + }; + + for (auto &Reduction : LoLoop.Reductions) { + MachineInstr &Copy = Reduction->Copy; + MachineInstr &Reduce = Reduction->Reduce; + Register DestReg = Copy.getOperand(0).getReg(); + + // Change the initialiser if present + if (Reduction->Init) { + MachineInstr *Init = Reduction->Init; + + for (unsigned i = 0; i < Init->getNumOperands(); ++i) { + MachineOperand &MO = Init->getOperand(i); + if (MO.isReg() && MO.isUse() && MO.isTied() && + Init->findTiedOperandIdx(i) == 0) + Init->getOperand(i).setReg(DestReg); + } + Init->getOperand(0).setReg(DestReg); + LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init); + } else + BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg()); + + // Change the reducing op to write to the register that is used to copy + // its value on the next iteration. Also update the tied-def operand. + Reduce.getOperand(0).setReg(DestReg); + Reduce.getOperand(5).setReg(DestReg); + LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce); + + // Instead of a vpsel, just copy the register into the necessary one. + MachineInstr &VPSEL = Reduction->VPSEL; + if (VPSEL.getOperand(0).getReg() != DestReg) + BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg); + + // Remove the unnecessary instructions. 
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" + << " - " << Copy + << " - " << VPSEL + << " - " << Reduction->VCTP); + Copy.eraseFromParent(); + VPSEL.eraseFromParent(); + Reduction->VCTP.eraseFromParent(); + } +} + void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -1203,8 +1447,10 @@ RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) + if (LoLoop.IsTailPredicationLegal()) { ConvertVPTBlocks(LoLoop); + FixupReductions(LoLoop); + } for (auto *I : LoLoop.ToRemove) { LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); I->eraseFromParent(); Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -210,33 +210,25 @@ ; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: add.w r4, r12, #3 -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r4, r4, #3 ; CHECK-NEXT: subs r5, r4, #4 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, r5, lsr #2 ; CHECK-NEXT: lsrs r4, r5, #2 ; CHECK-NEXT: sub.w r4, r12, r4, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vcmpt.i32 eq, q1, zr +; CHECK-NEXT: vcmp.i32 eq, q1, zr +; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 ; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: le lr, .LBB2_2 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: letp lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vctp.32 r4 -; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB2_4: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/constant-init-reduction.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/constant-init-reduction.mir @@ -0,0 +1,349 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc signext i16 @constant_init_sub_reduction(i8* nocapture readonly %a, i32 %N) { + entry: + %cmp8 = icmp eq i32 %N, 0 + %0 = add i32 %N, 7 + %1 = lshr i32 %0, 3 + %2 = shl nuw i32 %1, 3 + %3 = add i32 %2, -8 + %4 = lshr i32 %3, 3 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 3 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <8 x i16> [ , %vector.ph ], [ 
%13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv15 = bitcast i8* %lsr.iv to <8 x i8>* + %10 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %9) + %11 = sub i32 %9, 8 + %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv15, i32 1, <8 x i1> %10, <8 x i8> undef) + %12 = zext <8 x i8> %wide.masked.load to <8 x i16> + %13 = sub <8 x i16> %vec.phi, %12 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 8 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %13, %vector.body ] + %16 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %17 = select <8 x i1> %16, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %18 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i16 [ 32767, %entry ], [ %18, %middle.block ] + ret i16 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc signext i16 @constant_init_add_reduction(i8* nocapture readonly %a, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp eq i32 %N, 0 + %0 = add i32 %N, 7 + %1 = lshr i32 %0, 3 + %2 = shl nuw i32 %1, 3 + %3 = add i32 %2, -8 + %4 = lshr i32 %3, 3 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 3 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %13, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %14, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv15 = bitcast i8* %lsr.iv to <8 x i8>* + %10 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %9) + %11 = sub i32 %9, 8 + %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv15, i32 1, <8 x i1> %10, <8 x i8> undef) + %12 = zext <8 x i8> %wide.masked.load to <8 x i16> + %13 = add <8 x i16> %vec.phi, %12 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 8 + %14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %13, %vector.body ] + %16 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) + %17 = select <8 x i1> %16, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %18 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %17) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i16 [ 0, %entry ], [ %18, %middle.block ] + ret i16 %res.0.lcssa + } + + declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) + declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + +... 
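The IR preamble above is exactly the trip-count/element-count shape that `getMergePredicate` pattern-matches. A hedged worked example (not from the patch) with the arithmetic folded to plain integers, where `N` is the total element count, `X` the vector factor (a power of two) and `Shift = log2(X)`:

```cpp
#include <cassert>

// BackedgeCount, TripCount and ExitBlockElems from the preheader, as plain
// integer arithmetic. Names follow the comment in getMergePredicate.
unsigned exitBlockElems(unsigned N, unsigned X, unsigned Shift) {
  unsigned Rounded = (N + (X - 1)) & ~(X - 1);  // BIC(ADD(N, X-1), X-1)
  unsigned Backedge = Rounded - X;              // elements before the final iteration
  unsigned TripCount = (Backedge >> Shift) + 1; // iterations the loop executes
  assert(TripCount * X >= N && "loop must cover all elements");
  return N - ((Backedge >> Shift) << Shift);    // what the exit-block VCTP needs
}

// For N = 10, X = 8, Shift = 3: Rounded = 16, Backedge = 8, TripCount = 2,
// and the exit-block VCTP operates on the remaining 2 elements.
```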
+---
+name: constant_init_sub_reduction
+alignment: 16
+tracksRegLiveness: true
+registers: []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+frameInfo:
+  stackSize: 8
+  offsetAdjustment: 0
+  maxAlignment: 4
+fixedStack: []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants:
+  - id: 0
+    value: '<8 x i16> <i16 32767, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>'
+    alignment: 16
+    isTargetSpecific: false
+machineFunctionInfo: {}
+body: |
+  ; CHECK-LABEL: name: constant_init_sub_reduction
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $lr, $r0, $r1
+  ; CHECK:   tCMPi8 renamable $r1, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2IT 0, 2, implicit-def $itstate
+  ; CHECK:   renamable $r0 = t2MOVi16 32767, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate
+  ; CHECK:   renamable $r0 = tSXTH killed renamable $r0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate
+  ; CHECK:   tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_register $r7
+  ; CHECK:   renamable $r2, dead $cpsr = tADDi3 renamable $r1, 7, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r2 = t2BICri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 3, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r2 = t2SUBrs renamable $r1, killed renamable $r2, 26, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
+  ; CHECK:   $lr = t2DLS killed renamable $lr
+  ; CHECK: bb.1.vector.body:
+  ; CHECK:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2
+  ; CHECK:   renamable $vpr = MVE_VCTP16 renamable $r1, 0, $noreg
+  ; CHECK:   $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $r0, renamable $q0 = MVE_VLDRBU16_post killed renamable $r0, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv15, align 1)
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 8, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.1
+  ; CHECK: bb.2.middle.block:
+  ; CHECK:   liveins: $q0, $q1, $r2
+  ; CHECK:   renamable $vpr = MVE_VCTP16
killed renamable $r2, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + ; CHECK: bb.3 (align 16): + ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $lr + + tCMPi8 renamable $r1, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = t2MOVi16 32767, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tSXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r2, dead $cpsr = tADDi3 renamable $r1, 7, 14, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r2 = t2BICri killed renamable $r2, 7, 14, $noreg, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 27, 14, $noreg, $noreg + renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 3, 14, $noreg + renamable $r3 = tLEApcrel %const.0, 14, $noreg + renamable $r2 = t2SUBrs renamable $r1, killed renamable $r2, 26, 14, $noreg, $noreg + renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP16 renamable $r1, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRBU16_post killed renamable $r0, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv15, align 1) + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 8, 14, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r2 + + renamable $vpr = MVE_VCTP16 killed renamable $r2, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tSXTH killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + + bb.3 (align 16): + CONSTPOOL_ENTRY 0, %const.0, 16 + +... 
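The `$q1 = MVE_VORR killed $q0, $q0, ...` in the loop above is the copy that `FindValidReduction` has to prove is really a `vmov`: MVE has no plain vector register move, so `vmov qD, qM` is encoded as `vorr qD, qM, qM`. A hedged sketch of that test as a standalone helper (the function name is illustrative; the operand check is the one the patch performs inline):

```cpp
#include "llvm/CodeGen/MachineInstr.h"
#include "ARMBaseInstrInfo.h" // assumed available for the ARM::MVE_VORR opcode
using namespace llvm;

// True if MI is a VORR whose two register sources are identical, i.e. the
// vorr encoding of a vector vmov.
static bool isVMOVCopy(const MachineInstr &MI) {
  return MI.getOpcode() == ARM::MVE_VORR &&
         MI.getOperand(1).isReg() && MI.getOperand(2).isReg() &&
         MI.getOperand(1).getReg() == MI.getOperand(2).getReg();
}
```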
+--- +name: constant_init_add_reduction +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: constant_init_add_reduction + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: tCMPi8 renamable $r1, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 2, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r2, dead $cpsr = tADDi3 renamable $r1, 7, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2 = t2BICri killed renamable $r2, 7, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 27, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2 = t2SUBrs renamable $r1, killed renamable $r2, 26, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r1, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRBU16_post killed renamable $r0, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv15, align 1) + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VMOVLu8bh killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VADDi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r2, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed 
renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr + ; CHECK: renamable $r0 = tSXTH killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $lr + + tCMPi8 renamable $r1, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + renamable $r0 = tSXTH killed renamable $r0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r2, dead $cpsr = tADDi3 renamable $r1, 7, 14, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r2 = t2BICri killed renamable $r2, 7, 14, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 27, 14, $noreg, $noreg + renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 3, 14, $noreg + renamable $r2 = t2SUBrs renamable $r1, killed renamable $r2, 26, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP16 renamable $r1, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRBU16_post killed renamable $r0, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv15, align 1) + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 8, 14, $noreg + renamable $q0 = MVE_VMOVLu8bh killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1, $r2 + + renamable $vpr = MVE_VCTP16 killed renamable $r2, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r7, def $lr + renamable $r0 = tSXTH killed renamable $r0, 14, $noreg + tBX_RET 14, $noreg, implicit killed $r0 + +... 
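For the constant-trip-count test that follows, the numbers check out as 999 = 249 × 4 + 3: of the 250 iterations started by `t2MOVi 250`, the last one covers the remaining 3 elements, which is why the constant `tMOVi8 3` feeding the middle-block `MVE_VCTP32` is correct. Because these constants never flow through the LSR/SUB pattern that `getMergePredicate` matches, the reduction is left untouched and the `MVE_VPSEL` survives.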
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/constant-reduction.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/constant-reduction.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc i32 @constant_reduction(i8* nocapture readonly %a) { + entry: + call void @llvm.set.loop.iterations.i32(i32 250) + br label %vector.body + + vector.body: ; preds = %vector.body, %entry + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %entry ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %5, %vector.body ] + %0 = phi i32 [ 250, %entry ], [ %6, %vector.body ] + %1 = phi i32 [ 999, %entry ], [ %3, %vector.body ] + %lsr.iv10 = bitcast i8* %lsr.iv to <4 x i8>* + %2 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %1) + %3 = sub i32 %1, 4 + %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv10, i32 1, <4 x i1> %2, <4 x i8> undef) + %4 = zext <4 x i8> %wide.masked.load to <4 x i32> + %5 = add <4 x i32> %vec.phi, %4 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %6 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %7 = icmp ne i32 %6, 0 + br i1 %7, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <4 x i32> [ %vec.phi, %vector.body ] + %.lcssa = phi <4 x i32> [ %5, %vector.body ] + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 3) + %9 = select <4 x i1> %8, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa + %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + ret i32 %10 + } + + declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) +... 
+--- +name: constant_reduction +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: constant_reduction + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $lr = t2MOVi 250, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r1 = t2MOVi16 999, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VMOVimmi32 255, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + ; CHECK: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q2 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, killed renamable $vpr :: (load 4 from %ir.lsr.iv10, align 1) + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VAND killed renamable $q0, renamable $q1, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VADDi32 renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q2 + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q2, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $lr + + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $lr = t2MOVi 250, 14, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r1 = t2MOVi16 999, 14, $noreg + renamable $q1 = MVE_VMOVimmi32 255, 0, $noreg, undef renamable $q1 + t2DoLoopStart renamable $lr + + 
bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r1 + + renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + $q2 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q2 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, killed renamable $vpr :: (load 4 from %ir.lsr.iv10, align 1) + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + renamable $q0 = MVE_VAND killed renamable $q0, renamable $q1, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi32 renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q2 + + renamable $r0, dead $cpsr = tMOVi8 3, 14, $noreg + renamable $vpr = MVE_VCTP32 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q2, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir @@ -270,39 +270,32 @@ ; CHECK: renamable $r0 = t2BICri killed renamable $r0, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = t2LSLri $r10, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r0, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $r0, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VDUP32 renamable $r7, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r0 = nuw nsw t2ADDrs killed renamable $r0, renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r1, dead $cpsr = tLSRri killed renamable $r1, 2, 14 /* CC::al */, $noreg ; CHECK: renamable $r9 = t2SUBrs $r10, killed renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg ; CHECK: bb.5.for.cond4.preheader.us: ; CHECK: successors: %bb.6(0x80000000) - ; CHECK: liveins: $lr, $q0, $r0, $r3, $r4, $r5, $r7, $r8, $r9, $r10, $r12 + ; CHECK: liveins: $lr, $q0, $q2, $r0, $r3, $r4, $r5, $r7, $r8, $r9, $r10, $r12 ; CHECK: renamable $r1 = t2LDRs renamable $r4, renamable $r7, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.arrayidx12.us) - ; CHECK: $q1 = MVE_VORR $q0, $q0, 0, $noreg, undef $q1 + ; CHECK: dead $q1 = MVE_VORR $q0, $q0, 0, $noreg, undef $q1 ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg - ; CHECK: renamable $q1 = MVE_VMOV_to_lane_32 killed renamable $q1, killed renamable $r1, 0, 14 /* CC::al */, $noreg + ; CHECK: $q2 = MVE_VMOV_to_lane_32 killed $q2, killed renamable $r1, 0, 14 /* CC::al */, $noreg ; CHECK: $r6 = tMOVr $r5, 14 /* CC::al */, $noreg ; CHECK: $r1 = tMOVr $r8, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS renamable $r0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.6.vector.body: ; CHECK: successors: %bb.6(0x7c000000), %bb.7(0x04000000) - ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: $q2 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q2 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r6, renamable $q1 = MVE_VLDRHS32_post killed renamable $r6, 8, 1, renamable $vpr :: 
(load 8 from %ir.lsr.iv1012, align 2)
-  ; CHECK:   renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv46, align 2)
-  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   liveins: $lr, $q0, $q2, $r0, $r1, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r12
+  ; CHECK:   renamable $r6, renamable $q1 = MVE_VLDRHS32_post killed renamable $r6, 8, 0, $noreg :: (load 8 from %ir.lsr.iv1012, align 2)
+  ; CHECK:   renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv46, align 2)
   ; CHECK:   renamable $q1 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q1, 0, $noreg, undef renamable $q1
-  ; CHECK:   renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q2, 0, $noreg, undef renamable $q1
-  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.6
+  ; CHECK:   $q2 = MVE_VADDi32 renamable $q1, killed renamable $q2, 0, $noreg, undef $q2
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.6
 ; CHECK: bb.7.middle.block:
 ; CHECK:   successors: %bb.8(0x04000000), %bb.5(0x7c000000)
 ; CHECK:   liveins: $q0, $q1, $q2, $r0, $r3, $r4, $r5, $r7, $r8, $r9, $r10, $r12
-; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r9, 0, $noreg
 ; CHECK:   renamable $r5 = tADDhirr killed renamable $r5, renamable $r3, 14 /* CC::al */, $noreg
-; CHECK:   renamable $q1 = MVE_VPSEL killed renamable $q1, killed renamable $q2, 0, killed renamable $vpr
+; CHECK:   $q1 = MVE_VORR $q2, $q2, 0, $noreg, killed $q1
 ; CHECK:   $lr = tMOVr $r10, 14 /* CC::al */, $noreg
 ; CHECK:   renamable $r2 = MVE_VADDVu32no_acc killed renamable $q1, 0, $noreg
 ; CHECK:   t2STRs killed renamable $r2, renamable $r4, renamable $r7, 2, 14 /* CC::al */, $noreg :: (store 4 into %ir.27)
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-reductions.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-reductions.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s

+--- |
+  define dso_local arm_aapcs_vfpcc i32 @nested_reduction(i16** nocapture readonly %a, i16* nocapture readonly %b, i32 %N, i32 %M) {
+  entry:
+    %cmp23 = icmp eq i32 %N, 0
+    %cmp220 = icmp eq i32 %M, 0
+    %or.cond = or i1 %cmp23, %cmp220
+    br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
+
+  for.cond1.preheader.us.preheader:                 ; preds = %entry
+    %n.rnd.up = add i32 %M, 3
+    %n.vec = and i32 %n.rnd.up, -4
+    %0 = add i32 %n.vec, -4
+    %1 = lshr i32 %0, 2
+    %2 = add nuw nsw i32 %1, 1
+    %3 = shl i32 %1, 2
+    %4 = sub i32 %M, %3
+    br label %for.cond1.preheader.us
+
+  for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %middle.block
+    %i.025.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
+    %res.024.us = phi i32 [ %19, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
+    %arrayidx.us = getelementptr inbounds i16*, i16** %a, i32 %i.025.us
+    %5 = load i16*, i16** %arrayidx.us, align 4
+    %6 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %res.024.us, i32 0
+    call void @llvm.set.loop.iterations.i32(i32 %2)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body, %for.cond1.preheader.us
+    %lsr.iv38 = phi i16* [ %scevgep39, %vector.body ], [ %b, %for.cond1.preheader.us ]
+    %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %5,
%for.cond1.preheader.us ] + %vec.phi = phi <4 x i32> [ %6, %for.cond1.preheader.us ], [ %14, %vector.body ] + %7 = phi i32 [ %2, %for.cond1.preheader.us ], [ %15, %vector.body ] + %8 = phi i32 [ %M, %for.cond1.preheader.us ], [ %10, %vector.body ] + %lsr.iv3840 = bitcast i16* %lsr.iv38 to <4 x i16>* + %lsr.iv37 = bitcast i16* %lsr.iv to <4 x i16>* + %9 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %8) + %10 = sub i32 %8, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv37, i32 2, <4 x i1> %9, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load32 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv3840, i32 2, <4 x i1> %9, <4 x i16> undef) + %12 = sext <4 x i16> %wide.masked.load32 to <4 x i32> + %13 = add <4 x i32> %vec.phi, %11 + %14 = sub <4 x i32> %13, %12 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep39 = getelementptr i16, i16* %lsr.iv38, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %7, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <4 x i32> [ %vec.phi, %vector.body ] + %.lcssa = phi <4 x i32> [ %14, %vector.body ] + %17 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %4) + %18 = select <4 x i1> %17, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa + %19 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %18) + %inc9.us = add nuw i32 %i.025.us, 1 + %exitcond29 = icmp eq i32 %inc9.us, %N + br i1 %exitcond29, label %for.cond.cleanup, label %for.cond1.preheader.us + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %19, %middle.block ] + ret i32 %res.0.lcssa + } + + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +... 
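Before the MIR body, it may help to see the scalar loop nest the IR above was vectorised from. A hedged reconstruction (parameter names follow the IR arguments; the running total is re-inserted into lane 0 of a zero vector at the top of each outer iteration, matching the `insertelement` above):

```cpp
// Scalar equivalent of @nested_reduction: the inner loop is the one that
// gets vectorised by 4 and tail-predicated; res carries across rows.
int nested_reduction(short **a, short *b, int N, int M) {
  int res = 0;
  for (int i = 0; i < N; ++i)     // outer loop, one row of a per iteration
    for (int j = 0; j < M; ++j)   // inner loop, vector.body above
      res += a[i][j] - b[j];
  return res;
}
```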
+--- +name: nested_reduction +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 32 + offsetAdjustment: -24 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: -24, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r9', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r8', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: nested_reduction + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.5(0x55555555), %bb.1(0x2aaaaaab) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $lr, $r8, $r9, $r10 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20 + ; CHECK: $r7 = frame-setup tADDrSPi $sp, 3, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r8, killed $r9, killed $r10 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -24 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -28 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -32 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2IT 1, 8, implicit-def $itstate + ; CHECK: 
tCMPi8 renamable $r3, 0, 1 /* CC::ne */, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + ; CHECK: tBcc %bb.5, 0 /* CC::eq */, killed $cpsr + ; CHECK: bb.1.for.cond1.preheader.us.preheader: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: $r8 = tMOVr killed $r1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r1, dead $cpsr = tADDi3 renamable $r3, 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r1 = t2BICri killed renamable $r1, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r6, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r5, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VDUP32 renamable $r5, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r10 = nuw nsw t2ADDrs killed renamable $r6, renamable $r1, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r1, dead $cpsr = tLSRri killed renamable $r1, 2, 14 /* CC::al */, $noreg + ; CHECK: renamable $r9 = t2SUBrs renamable $r3, killed renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg + ; CHECK: bb.2.for.cond1.preheader.us: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $q0, $r0, $r2, $r3, $r5, $r8, $r9, $r10, $r12 + ; CHECK: renamable $r6 = t2LDRs renamable $r0, renamable $r5, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.arrayidx.us) + ; CHECK: $q2 = MVE_VORR $q0, $q0, 0, $noreg, undef $q2 + ; CHECK: renamable $q2 = MVE_VMOV_to_lane_32 killed renamable $q2, killed renamable $r12, 0, 14 /* CC::al */, $noreg + ; CHECK: $r1 = tMOVr $r8, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS renamable $r10 + ; CHECK: $r4 = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: bb.3.vector.body: + ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) + ; CHECK: liveins: $lr, $q0, $q2, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r4, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q2, killed $q2, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHU32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv3840, align 2) + ; CHECK: renamable $r6, renamable $q3 = MVE_VLDRHU32_post killed renamable $r6, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv37, align 2) + ; CHECK: renamable $r4, dead $cpsr = tSUBi8 killed renamable $r4, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q3 = MVE_VADDi32 renamable $q1, killed renamable $q3, 0, $noreg, undef renamable $q3 + ; CHECK: renamable $q2 = MVE_VSUBi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3 + ; CHECK: bb.4.middle.block: + ; CHECK: successors: %bb.5(0x04000000), %bb.2(0x7c000000) + ; CHECK: liveins: $q0, $q1, $q2, $r0, $r2, $r3, $r5, $r8, $r9, $r10 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r9, 0, $noreg + ; CHECK: renamable $r5, dead $cpsr = nuw tADDi8 killed renamable $r5, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VPSEL killed renamable $q2, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: tCMPr renamable $r5, renamable $r2, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: renamable $r12 = MVE_VADDVu32no_acc killed renamable $q1, 0, $noreg + ; CHECK: tBcc %bb.2, 1 /* CC::ne */, killed $cpsr + ; CHECK: bb.5.for.cond.cleanup: + ; CHECK: liveins: $r12 + ; 
CHECK: $r0 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r8, def $r9, def $r10 + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.5(0x80000000), %bb.1(0x40000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $lr, $r8, $r9, $r10 + + frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 20 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r6, -12 + frame-setup CFI_INSTRUCTION offset $r5, -16 + frame-setup CFI_INSTRUCTION offset $r4, -20 + $r7 = frame-setup tADDrSPi $sp, 3, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r8, killed $r9, killed $r10 + frame-setup CFI_INSTRUCTION offset $r10, -24 + frame-setup CFI_INSTRUCTION offset $r9, -28 + frame-setup CFI_INSTRUCTION offset $r8, -32 + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + t2IT 1, 8, implicit-def $itstate + tCMPi8 renamable $r3, 0, 1, killed $cpsr, implicit-def $cpsr, implicit killed $itstate + tBcc %bb.5, 0, killed $cpsr + + bb.1.for.cond1.preheader.us.preheader: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3 + + $r8 = tMOVr killed $r1, 14, $noreg + renamable $r1, dead $cpsr = tADDi3 renamable $r3, 3, 14, $noreg + renamable $r1 = t2BICri killed renamable $r1, 3, 14, $noreg, $noreg + renamable $r6, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + renamable $r5, dead $cpsr = tMOVi8 0, 14, $noreg + renamable $q0 = MVE_VDUP32 renamable $r5, 0, $noreg, undef renamable $q0 + renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + renamable $r10 = nuw nsw t2ADDrs killed renamable $r6, renamable $r1, 19, 14, $noreg, $noreg + renamable $r1, dead $cpsr = tLSRri killed renamable $r1, 2, 14, $noreg + renamable $r9 = t2SUBrs renamable $r3, killed renamable $r1, 18, 14, $noreg, $noreg + + bb.2.for.cond1.preheader.us: + successors: %bb.3(0x80000000) + liveins: $q0, $r0, $r2, $r3, $r5, $r8, $r9, $r10, $r12 + + renamable $r6 = t2LDRs renamable $r0, renamable $r5, 2, 14, $noreg :: (load 4 from %ir.arrayidx.us) + $q2 = MVE_VORR $q0, $q0, 0, $noreg, undef $q2 + renamable $q2 = MVE_VMOV_to_lane_32 killed renamable $q2, killed renamable $r12, 0, 14, $noreg + $r1 = tMOVr $r8, 14, $noreg + $lr = tMOVr $r10, 14, $noreg + $r4 = tMOVr $r3, 14, $noreg + t2DoLoopStart renamable $r10 + + bb.3.vector.body: + successors: %bb.3(0x7c000000), %bb.4(0x04000000) + liveins: $lr, $q0, $q2, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10 + + renamable $vpr = MVE_VCTP32 renamable $r4, 0, $noreg + $q1 = MVE_VORR killed $q2, $q2, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q2 = MVE_VLDRHU32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv3840, align 2) + renamable $r6, renamable $q3 = MVE_VLDRHU32_post killed renamable $r6, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv37, align 2) + renamable $r4, dead $cpsr = tSUBi8 killed renamable $r4, 4, 14, $noreg + renamable $q3 = MVE_VADDi32 renamable $q1, killed renamable $q3, 0, $noreg, undef renamable $q3 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q2 = MVE_VSUBi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef 
renamable $q2 + t2LoopEnd renamable $lr, %bb.3, implicit-def dead $cpsr + tB %bb.4, 14, $noreg + + bb.4.middle.block: + successors: %bb.5(0x04000000), %bb.2(0x7c000000) + liveins: $q0, $q1, $q2, $r0, $r2, $r3, $r5, $r8, $r9, $r10 + + renamable $vpr = MVE_VCTP32 renamable $r9, 0, $noreg + renamable $r5, dead $cpsr = nuw tADDi8 killed renamable $r5, 1, 14, $noreg + renamable $q1 = MVE_VPSEL killed renamable $q2, killed renamable $q1, 0, killed renamable $vpr + tCMPr renamable $r5, renamable $r2, 14, $noreg, implicit-def $cpsr + renamable $r12 = MVE_VADDVu32no_acc killed renamable $q1, 0, $noreg + tBcc %bb.2, 1, killed $cpsr + + bb.5.for.cond.cleanup: + liveins: $r12 + + $r0 = tMOVr killed $r12, 14, $noreg + $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r8, def $r9, def $r10 + tPOP_RET 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-8-16.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-8-16.mir @@ -0,0 +1,592 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc zeroext i8 @test_add_reduce_8(i8* nocapture readonly %a, i8* nocapture readonly %b, i8* nocapture readonly %c) { + entry: + call void @llvm.set.loop.iterations.i32(i32 32) + br label %vector.body + + vector.body: ; preds = %vector.body, %entry + %lsr.iv24 = phi i8* [ %scevgep25, %vector.body ], [ %c, %entry ] + %lsr.iv21 = phi i8* [ %scevgep22, %vector.body ], [ %b, %entry ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %entry ] + %vec.phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %6, %vector.body ] + %0 = phi i32 [ 32, %entry ], [ %7, %vector.body ] + %1 = phi i32 [ 499, %entry ], [ %3, %vector.body ] + %lsr.iv2426 = bitcast i8* %lsr.iv24 to <16 x i8>* + %lsr.iv2123 = bitcast i8* %lsr.iv21 to <16 x i8>* + %lsr.iv20 = bitcast i8* %lsr.iv to <16 x i8>* + %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %1) + %3 = sub i32 %1, 16 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv20, i32 1, <16 x i1> %2, <16 x i8> undef) + %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2123, i32 1, <16 x i1> %2, <16 x i8> undef) + %4 = mul <16 x i8> %wide.masked.load16, %wide.masked.load + %wide.masked.load17 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2426, i32 1, <16 x i1> %2, <16 x i8> undef) + %5 = add <16 x i8> %wide.masked.load17, %vec.phi + %6 = add <16 x i8> %5, %4 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep22 = getelementptr i8, i8* %lsr.iv21, i32 16 + %scevgep25 = getelementptr i8, i8* %lsr.iv24, i32 16 + %7 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %8 = icmp ne i32 %7, 0 + br i1 %8, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <16 x i8> [ %vec.phi, %vector.body ] + %.lcssa = phi <16 x i8> [ %6, %vector.body ] + %9 = call <16 x i1> @llvm.arm.mve.vctp8(i32 3) + %10 = select <16 x i1> %9, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa + %11 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %10) + ret i8 %11 + } + + define dso_local arm_aapcs_vfpcc zeroext i8 @test_sub_reduce_8(i8* nocapture readonly %a, i8* nocapture readonly %b, i8* nocapture readonly %c) { + entry: + call void 
@llvm.set.loop.iterations.i32(i32 32) + br label %vector.body + + vector.body: ; preds = %vector.body, %entry + %lsr.iv23 = phi i8* [ %scevgep24, %vector.body ], [ %c, %entry ] + %lsr.iv20 = phi i8* [ %scevgep21, %vector.body ], [ %b, %entry ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %entry ] + %vec.phi = phi <16 x i8> [ , %entry ], [ %6, %vector.body ] + %0 = phi i32 [ 32, %entry ], [ %7, %vector.body ] + %1 = phi i32 [ 499, %entry ], [ %3, %vector.body ] + %lsr.iv2325 = bitcast i8* %lsr.iv23 to <16 x i8>* + %lsr.iv2022 = bitcast i8* %lsr.iv20 to <16 x i8>* + %lsr.iv19 = bitcast i8* %lsr.iv to <16 x i8>* + %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %1) + %3 = sub i32 %1, 16 + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv19, i32 1, <16 x i1> %2, <16 x i8> undef) + %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2022, i32 1, <16 x i1> %2, <16 x i8> undef) + %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load + %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv2325, i32 1, <16 x i1> %2, <16 x i8> undef) + %5 = sub <16 x i8> %vec.phi, %wide.masked.load16 + %6 = sub <16 x i8> %5, %4 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 + %scevgep21 = getelementptr i8, i8* %lsr.iv20, i32 16 + %scevgep24 = getelementptr i8, i8* %lsr.iv23, i32 16 + %7 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %8 = icmp ne i32 %7, 0 + br i1 %8, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <16 x i8> [ %vec.phi, %vector.body ] + %.lcssa = phi <16 x i8> [ %6, %vector.body ] + %9 = call <16 x i1> @llvm.arm.mve.vctp8(i32 3) + %10 = select <16 x i1> %9, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa + %11 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %10) + ret i8 %11 + } + + define dso_local arm_aapcs_vfpcc zeroext i16 @test_add_reduce_16(i16* nocapture readonly %a, i16* nocapture readonly %b) { + entry: + call void @llvm.set.loop.iterations.i32(i32 38) + br label %vector.body + + vector.body: ; preds = %vector.body, %entry + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %entry ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %entry ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %5, %vector.body ] + %0 = phi i32 [ 38, %entry ], [ %6, %vector.body ] + %1 = phi i32 [ 299, %entry ], [ %3, %vector.body ] + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <8 x i16>* + %lsr.iv17 = bitcast i16* %lsr.iv to <8 x i16>* + %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %1) + %3 = sub i32 %1, 8 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %2, <8 x i16> undef) + %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1820, i32 2, <8 x i1> %2, <8 x i16> undef) + %4 = and <8 x i16> %wide.masked.load14, %wide.masked.load + %5 = add <8 x i16> %4, %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 8 + %6 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %7 = icmp ne i32 %6, 0 + br i1 %7, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %5, %vector.body ] + %8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 3) + %9 = select <8 x i1> %8, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %10 = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %9) + ret i16 %10 + } + + define dso_local arm_aapcs_vfpcc zeroext i16 @test_sub_reduce_16(i16* nocapture readonly %a, i16* nocapture readonly %b) { + entry: + call void @llvm.set.loop.iterations.i32(i32 38) + br label %vector.body + + vector.body: ; preds = %vector.body, %entry + %lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %entry ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %entry ] + %vec.phi = phi <8 x i16> [ , %entry ], [ %5, %vector.body ] + %0 = phi i32 [ 38, %entry ], [ %6, %vector.body ] + %1 = phi i32 [ 299, %entry ], [ %3, %vector.body ] + %lsr.iv1820 = bitcast i16* %lsr.iv18 to <8 x i16>* + %lsr.iv17 = bitcast i16* %lsr.iv to <8 x i16>* + %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %1) + %3 = sub i32 %1, 8 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %2, <8 x i16> undef) + %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1820, i32 2, <8 x i1> %2, <8 x i16> undef) + %4 = and <8 x i16> %wide.masked.load14, %wide.masked.load + %5 = sub <8 x i16> %vec.phi, %4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 8 + %6 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %7 = icmp ne i32 %6, 0 + br i1 %7, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ] + %.lcssa = phi <8 x i16> [ %5, %vector.body ] + %8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 3) + %9 = select <8 x i1> %8, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa + %10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %9) + ret i16 %10 + } + + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) + declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <16 x i1> @llvm.arm.mve.vctp8(i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + +... 
+--- +name: test_add_reduce_8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_add_reduce_8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $lr = t2MOVi 32, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r3 = t2MOVi16 499, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv20, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2123, align 1) + ; CHECK: renamable $r2, renamable $q3 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv2426, align 1) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VMULi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q2 = MVE_VADDi8 killed renamable $q3, renamable $q0, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q1 = MVE_VADDi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1 + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP8 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + 
frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $lr = t2MOVi 32, 14, $noreg, $noreg + renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + renamable $r3 = t2MOVi16 499, 14, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q1, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + MVE_VPST 2, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv20, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2123, align 1) + renamable $r2, renamable $q3 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv2426, align 1) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg + renamable $q1 = MVE_VMULi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $q2 = MVE_VADDi8 killed renamable $q3, renamable $q0, 0, $noreg, undef renamable $q2 + renamable $q1 = MVE_VADDi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1 + + renamable $r0, dead $cpsr = tMOVi8 3, 14, $noreg + renamable $vpr = MVE_VCTP8 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: test_sub_reduce_8 +alignment: 16 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: + - id: 0 + value: '<16 x i8> ' + alignment: 16 + isTargetSpecific: false +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_sub_reduce_8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2MOVi 32, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q1 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) + ; CHECK: renamable $r3 = t2MOVi16 499, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv19, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2022, align 1) + ; CHECK: renamable $r2, renamable $q3 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv2325, align 1) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VMULi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q2 = MVE_VSUBi8 renamable $q0, killed renamable $q3, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q1 = MVE_VSUBi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1 + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP8 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + ; CHECK: renamable $r0 = tUXTB killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, 
implicit killed $r0 + ; CHECK: bb.3 (align 16): + ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3 = tLEApcrel %const.0, 14, $noreg + renamable $lr = t2MOVi 32, 14, $noreg, $noreg + renamable $q1 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool) + renamable $r3 = t2MOVi16 499, 14, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q1, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + MVE_VPST 2, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU8_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv19, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv2022, align 1) + renamable $r2, renamable $q3 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv2325, align 1) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg + renamable $q1 = MVE_VMULi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $q2 = MVE_VSUBi8 renamable $q0, killed renamable $q3, 0, $noreg, undef renamable $q2 + renamable $q1 = MVE_VSUBi8 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1 + + renamable $r0, dead $cpsr = tMOVi8 3, 14, $noreg + renamable $vpr = MVE_VCTP8 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu8no_acc killed renamable $q0, 0, $noreg + renamable $r0 = tUXTB killed renamable $r0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + + bb.3 (align 16): + CONSTPOOL_ENTRY 0, %const.0, 16 + +... 
+--- +name: test_add_reduce_16 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_add_reduce_16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $lr = t2MOVi 38, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r2 = t2MOVi16 299, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1820, align 2) + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VAND killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VADDi16 killed renamable $q0, renamable $q1, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1 + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $lr + + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $lr = t2MOVi 38, 14, $noreg, $noreg + renamable $q0 = 
MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r2 = t2MOVi16 299, 14, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1820, align 2) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $q0 = MVE_VAND killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi16 killed renamable $q0, renamable $q1, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1 + + renamable $r0, dead $cpsr = tMOVi8 3, 14, $noreg + renamable $vpr = MVE_VCTP16 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... +--- +name: test_sub_reduce_16 +alignment: 16 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: + - id: 0 + value: '<8 x i16> ' + alignment: 16 + isTargetSpecific: false +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_sub_reduce_16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r2 = tLEApcrel %const.0, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2MOVi 38, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) + ; CHECK: renamable $r2 = t2MOVi16 299, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + ; CHECK: $q1 = MVE_VORR killed $q0, killed $q0, 0, $noreg, undef $q1 + ; CHECK: MVE_VPST 4, 
implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1820, align 2) + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VAND killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.middle.block: + ; CHECK: liveins: $q0, $q1 + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP16 killed renamable $r0, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + ; CHECK: renamable $r0 = tUXTH killed renamable $r0, 14 /* CC::al */, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + ; CHECK: bb.3 (align 16): + ; CHECK: CONSTPOOL_ENTRY 0, %const.0, 16 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $lr + + frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r2 = tLEApcrel %const.0, 14, $noreg + renamable $lr = t2MOVi 38, 14, $noreg, $noreg + renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) + renamable $r2 = t2MOVi16 299, 14, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP16 renamable $r2, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv17, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1820, align 2) + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 8, 14, $noreg + renamable $q0 = MVE_VAND killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VSUBi16 renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.middle.block: + liveins: $q0, $q1 + + renamable $r0, dead $cpsr = tMOVi8 3, 14, $noreg + renamable $vpr = MVE_VCTP16 killed renamable $r0, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu16no_acc killed renamable $q0, 0, $noreg + renamable $r0 = tUXTH killed renamable $r0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + + bb.3 (align 16): + CONSTPOOL_ENTRY 0, %const.0, 16 + +... 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/two-reducing-loops.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/two-reducing-loops.mir @@ -0,0 +1,304 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +--- | + define dso_local arm_aapcs_vfpcc i32 @two_reducing_loops(i16* nocapture readonly %a, i16* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) { + entry: + %cmp28 = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp28, label %for.cond.cleanup7, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + %6 = shl i32 %4, 2 + %7 = sub i32 %N, %6 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv66 = phi i16* [ %scevgep67, %vector.body ], [ %b, %vector.ph ] + %lsr.iv63 = phi i16* [ %scevgep64, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %15, %vector.body ] + %8 = phi i32 [ %5, %vector.ph ], [ %16, %vector.body ] + %9 = phi i32 [ %N, %vector.ph ], [ %11, %vector.body ] + %lsr.iv6668 = bitcast i16* %lsr.iv66 to <4 x i16>* + %lsr.iv6365 = bitcast i16* %lsr.iv63 to <4 x i16>* + %10 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %9) + %11 = sub i32 %9, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6365, i32 2, <4 x i1> %10, <4 x i16> undef) + %12 = zext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load36 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6668, i32 2, <4 x i1> %10, <4 x i16> undef) + %13 = zext <4 x i16> %wide.masked.load36 to <4 x i32> + %14 = mul nuw nsw <4 x i32> %13, %12 + %15 = add <4 x i32> %14, %vec.phi + %scevgep64 = getelementptr i16, i16* %lsr.iv63, i32 4 + %scevgep67 = getelementptr i16, i16* %lsr.iv66, i32 4 + %16 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %8, i32 1) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %vec.phi.lcssa = phi <4 x i32> [ %vec.phi, %vector.body ] + %.lcssa70 = phi <4 x i32> [ %15, %vector.body ] + %18 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %19 = icmp eq i32 %N, 0 + %20 = select <4 x i1> %18, <4 x i32> %.lcssa70, <4 x i32> %vec.phi.lcssa + %21 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %20) + %22 = add i32 %N, 3 + %23 = lshr i32 %22, 2 + %24 = shl nuw i32 %23, 2 + %25 = add i32 %24, -4 + %26 = lshr i32 %25, 2 + %27 = add nuw nsw i32 %26, 1 + br i1 %19, label %for.cond.cleanup7, label %vector.ph40 + + vector.ph40: ; preds = %middle.block + %28 = insertelement <4 x i32> , i32 %21, i32 0 + call void @llvm.set.loop.iterations.i32(i32 %27) + %29 = shl i32 %26, 2 + %30 = sub i32 %N, %29 + br label %vector.body39 + + vector.body39: ; preds = %vector.body39, %vector.ph40 + %lsr.iv = phi i8* [ %scevgep, %vector.body39 ], [ %c, %vector.ph40 ] + %vec.phi51 = phi <4 x i32> [ %28, %vector.ph40 ], [ %36, %vector.body39 ] + %31 = phi i32 [ %27, %vector.ph40 ], [ %37, %vector.body39 ] + %32 = phi i32 [ %N, %vector.ph40 ], [ %34, %vector.body39 ] + %lsr.iv62 = bitcast i8* %lsr.iv to <4 x i8>* + %33 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %32) + %34 = sub i32 %32, 4 + %wide.masked.load54 = call <4 x i8> 
@llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv62, i32 1, <4 x i1> %33, <4 x i8> undef) + %35 = zext <4 x i8> %wide.masked.load54 to <4 x i32> + %36 = sub <4 x i32> %vec.phi51, %35 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %37 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %31, i32 1) + %38 = icmp ne i32 %37, 0 + br i1 %38, label %vector.body39, label %middle.block37 + + middle.block37: ; preds = %vector.body39 + %vec.phi51.lcssa = phi <4 x i32> [ %vec.phi51, %vector.body39 ] + %.lcssa = phi <4 x i32> [ %36, %vector.body39 ] + %39 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %30) + %40 = select <4 x i1> %39, <4 x i32> %.lcssa, <4 x i32> %vec.phi51.lcssa + %41 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %40) + br label %for.cond.cleanup7 + + for.cond.cleanup7: ; preds = %middle.block37, %entry, %middle.block + %res.1.lcssa = phi i32 [ %21, %middle.block ], [ 0, %entry ], [ %41, %middle.block37 ] + ret i32 %res.1.lcssa + } + + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #1 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 +... +--- +name: two_reducing_loops +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 16 + offsetAdjustment: -8 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: two_reducing_loops + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.8(0x30000000), %bb.1(0x50000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16 + ; CHECK: dead $r7 = frame-setup tADDrSPi $sp, 2, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + ; CHECK: tCMPi8 renamable $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.8, 
0 /* CC::eq */, killed $cpsr + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r5, dead $cpsr = tADDi3 renamable $r3, 3, 14 /* CC::al */, $noreg + ; CHECK: $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef $q1 + ; CHECK: renamable $r5 = t2BICri killed renamable $r5, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r4, dead $cpsr = tSUBi3 killed renamable $r5, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r5, dead $cpsr = tLSRri renamable $r4, 2, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBrs renamable $r3, killed renamable $r5, 18, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r5 = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r5 + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2, $r3, $r4, $r12 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRHU32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv6365, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHU32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv6668, align 2) + ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef $q1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: successors: %bb.7(0x30000000), %bb.4(0x50000000) + ; CHECK: liveins: $q0, $q1, $r2, $r3, $r4, $r12 + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, killed $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: tCBZ $r3, %bb.7 + ; CHECK: bb.4.vector.ph40: + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: liveins: $r0, $r2, $r3, $r4, $r12 + ; CHECK: renamable $r1, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r1, killed renamable $r4, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r1, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 255, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r1, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: renamable $q1 = MVE_VMOV_to_lane_32 killed renamable $q1, killed renamable $r0, 0, 14 /* CC::al */, $noreg + ; CHECK: bb.5.vector.body39: + ; CHECK: successors: %bb.5(0x7c000000), %bb.6(0x04000000) + ; CHECK: liveins: $lr, $q0, $q1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: $q2 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q2 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRBU32_post killed renamable $r2, 4, 1, killed renamable $vpr :: (load 4 from %ir.lsr.iv62, align 1) + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VAND killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q1 = MVE_VSUBi32 renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.5 + ; CHECK: bb.6.middle.block37: + ; CHECK: successors: %bb.7(0x80000000) + ; CHECK: liveins: $q1, $q2, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r12, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, 
killed renamable $q2, 0, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: bb.7.for.cond.cleanup7: + ; CHECK: liveins: $r0 + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + ; CHECK: bb.8: + ; CHECK: renamable $r0, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.8(0x30000000), %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $lr + + frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + tBcc %bb.8, 0, killed $cpsr + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3 + + renamable $r5, dead $cpsr = tADDi3 renamable $r3, 3, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r5 = t2BICri killed renamable $r5, 3, 14, $noreg, $noreg + renamable $r4, dead $cpsr = tSUBi3 killed renamable $r5, 4, 14, $noreg + renamable $r5, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r5, renamable $r4, 19, 14, $noreg, $noreg + renamable $r5, dead $cpsr = tLSRri renamable $r4, 2, 14, $noreg + renamable $r12 = t2SUBrs renamable $r3, killed renamable $r5, 18, 14, $noreg, $noreg + $r5 = tMOVr $r3, 14, $noreg + t2DoLoopStart renamable $lr + + bb.2.vector.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r5, 0, $noreg + $q1 = MVE_VORR killed $q0, $q0, 0, $noreg, undef $q1 + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRHU32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv6365, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHU32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv6668, align 2) + renamable $r5, dead $cpsr = tSUBi8 killed renamable $r5, 4, 14, $noreg + renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q1, 0, $noreg, undef renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg + + bb.3.middle.block: + successors: %bb.7(0x30000000), %bb.4(0x50000000) + liveins: $q0, $q1, $r2, $r3, $r4, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q0, killed renamable $q1, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + tCBZ $r3, %bb.7 + + bb.4.vector.ph40: + successors: %bb.5(0x80000000) + liveins: $r0, $r2, $r3, $r4, $r12 + + renamable $r1, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r1, killed renamable $r4, 19, 14, $noreg, $noreg + renamable $r1, dead $cpsr = tMOVi8 0, 14, $noreg + renamable $q0 = MVE_VMOVimmi32 255, 0, $noreg, undef renamable $q0 + renamable $q1 = MVE_VDUP32 killed renamable $r1, 0, 
$noreg, undef renamable $q1 + t2DoLoopStart renamable $lr + renamable $q1 = MVE_VMOV_to_lane_32 killed renamable $q1, killed renamable $r0, 0, 14, $noreg + + bb.5.vector.body39: + successors: %bb.5(0x7c000000), %bb.6(0x04000000) + liveins: $lr, $q0, $q1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $q2 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q2 + MVE_VPST 8, implicit $vpr + renamable $r2, renamable $q1 = MVE_VLDRBU32_post killed renamable $r2, 4, 1, killed renamable $vpr :: (load 4 from %ir.lsr.iv62, align 1) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $q1 = MVE_VAND killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q1 = MVE_VSUBi32 renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + t2LoopEnd renamable $lr, %bb.5, implicit-def dead $cpsr + tB %bb.6, 14, $noreg + + bb.6.middle.block37: + successors: %bb.7(0x80000000) + liveins: $q1, $q2, $r12 + + renamable $vpr = MVE_VCTP32 killed renamable $r12, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q2, 0, killed renamable $vpr + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + + bb.7.for.cond.cleanup7: + liveins: $r0 + + tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + + bb.8: + renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg + tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + +... Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -10,28 +10,21 @@ ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: lsr.w r3, r12, #2 ; CHECK-NEXT: sub.w r3, r2, r3, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -84,26 +77,19 @@ ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: lsrs r1, r1, #2 ; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 
-; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -152,26 +138,19 @@ ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: lsrs r1, r1, #2 ; CHECK-NEXT: sub.w r1, r2, r1, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r1 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: