diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -39,6 +39,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -111,10 +112,10 @@ /// intrinsic. E.g., check that the loop induction variable and the element /// count are of the form we expect, and also perform overflow checks for /// the new expressions that are created. - bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); + const SCEV *IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *Start); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -198,8 +199,8 @@ // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount // 3) The IV must be an induction phi with an increment equal to the // vector width. 
-bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { +const SCEV *MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, + Value *TripCount) { bool ForceTailPredication = EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; @@ -207,7 +208,7 @@ Value *ElemCount = ActiveLaneMask->getOperand(1); bool Changed = false; if (!L->makeLoopInvariant(ElemCount, Changed)) - return false; + return nullptr; auto *EC= SE->getSCEV(ElemCount); auto *TC = SE->getSCEV(TripCount); @@ -215,7 +216,7 @@ cast(ActiveLaneMask->getType())->getNumElements(); if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) - return false; + return nullptr; ConstantInt *ConstElemCount = nullptr; // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to @@ -223,7 +224,38 @@ // processed by the loop, so we will refer to that from this point on. if (!SE->isLoopInvariant(EC, L)) { LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); - return false; + return nullptr; + } + + // 2) Find out if IV is an induction phi. Note that we can't use Loop + // helpers here to get the induction variable, because the hardware loop is + // no longer in loopsimplify form, and also the hwloop intrinsic uses a + // different counter. Using SCEV, we check that the induction is of the + // form i = i + 4, where the increment must be equal to the VectorWidth. + auto *IV = ActiveLaneMask->getOperand(0); + auto *IVExpr = SE->getSCEV(IV); + auto *AddExpr = dyn_cast(IVExpr); + + if (!AddExpr) { + LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); + return nullptr; + } + // Check that this AddRec is associated with this loop. 
+ if (AddExpr->getLoop() != L) { + LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); + return nullptr; + } + auto *Step = dyn_cast(AddExpr->getOperand(1)); + if (!Step) { + LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; + AddExpr->getOperand(1)->dump()); + return nullptr; + } + auto StepValue = Step->getValue()->getSExtValue(); + if (VectorWidth != StepValue) { + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue + << " doesn't match vector width " << VectorWidth << "\n"); + return nullptr; } if ((ConstElemCount = dyn_cast(ElemCount))) { @@ -231,7 +263,7 @@ if (!TC) { LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " "set.loop.iterations\n"); - return false; + return nullptr; } // Calculate 2 tripcount values and check that they are consistent with @@ -249,10 +281,10 @@ LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " << TC1 << " from set.loop.iterations, and " << TC2 << " from get.active.lane.mask\n"); - return false; + return nullptr; } } else if (!ForceTailPredication) { - // 2) We need to prove that the sub expression that we create in the + // 3) We need to prove that the sub expression that we create in the // tail-predicated loop body, which calculates the remaining elements to be // processed, is non-negative, i.e. 
it doesn't overflow: // @@ -266,6 +298,7 @@ // auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); // ElementCount + (VW-1): + auto *Start = AddExpr->getStart(); auto *ECPlusVWMinus1 = SE->getAddExpr(EC, SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); @@ -274,18 +307,20 @@ // Prevent unused variable warnings with TC (void)TC; - LLVM_DEBUG( + LLVM_DEBUG({ dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; - dbgs() << "ARM TP: - TripCount = "; TC->dump(); - dbgs() << "ARM TP: - ElemCount = "; EC->dump(); + dbgs() << "ARM TP: - TripCount = " << *TC << "\n"; + dbgs() << "ARM TP: - ElemCount = " << *EC << "\n"; + dbgs() << "ARM TP: - Start = " << *Start << "\n"; + dbgs() << "ARM TP: - BETC = " << *SE->getBackedgeTakenCount(L) << "\n"; dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; - dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); - ); + dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = " << *Ceil << "\n"; + }); // As an example, almost all the tripcount expressions (produced by the // vectoriser) look like this: // - // TC = ((-4 + (4 * ((3 + %N) /u 4))) /u 4) + // TC = ((-4 + (4 * ((3 + %N) /u 4)) - start) /u 4) // // and "ElementCount + (VW-1) / VW": // @@ -294,64 +329,56 @@ // Check for equality of TC and Ceil by calculating SCEV expression // TC - Ceil and test it for zero. // - const SCEV *Sub = - SE->getMinusSCEV(SE->getBackedgeTakenCount(L), - SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), - SE->getNegativeSCEV(VW)), - VW)); + const SCEV *Div = SE->getUDivExpr( + SE->getAddExpr(SE->getMulExpr(Ceil, VW), SE->getNegativeSCEV(VW), + SE->getNegativeSCEV(Start)), + VW); + const SCEV *Sub = SE->getMinusSCEV(SE->getBackedgeTakenCount(L), Div); + LLVM_DEBUG(dbgs() << "ARM TP: - Sub = "; Sub->dump()); // Use context sensitive facts about the path to the loop to refine. 
This // comes up as the backedge taken count can incorporate context sensitive // reasoning, and our RHS just above doesn't. Sub = SE->applyLoopGuards(Sub, L); + LLVM_DEBUG(dbgs() << "ARM TP: - (Guarded) = "; Sub->dump()); if (!Sub->isZero()) { LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); - return false; + return nullptr; } } - // 3) Find out if IV is an induction phi. Note that we can't use Loop - // helpers here to get the induction variable, because the hardware loop is - // no longer in loopsimplify form, and also the hwloop intrinsic uses a - // different counter. Using SCEV, we check that the induction is of the - // form i = i + 4, where the increment must be equal to the VectorWidth. - auto *IV = ActiveLaneMask->getOperand(0); - auto *IVExpr = SE->getSCEV(IV); - auto *AddExpr = dyn_cast(IVExpr); - - if (!AddExpr) { - LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); - return false; + // Check that the start value is a multiple of the VectorWidth. + // TODO: This could do with a method to check if the scev is a multiple of + // VectorWidth. For the moment we just check for constants, muls and unknowns + // (which use MaskedValueIsZero and seems to be the most common). 
+ if (auto *BaseC = dyn_cast<SCEVConstant>(AddExpr->getStart())) { + if (BaseC->getAPInt().urem(VectorWidth) == 0) + return SE->getMinusSCEV(EC, BaseC); + } else if (auto *BaseV = dyn_cast<SCEVUnknown>(AddExpr->getStart())) { + Type *Ty = BaseV->getType(); + APInt Mask = APInt::getLowBitsSet(Ty->getPrimitiveSizeInBits(), + Log2_64(VectorWidth)); + if (MaskedValueIsZero(BaseV->getValue(), Mask, + L->getHeader()->getModule()->getDataLayout())) + return SE->getMinusSCEV(EC, BaseV); + } else if (auto *BaseMul = dyn_cast<SCEVMulExpr>(AddExpr->getStart())) { + if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(0))) + if (BaseC->getAPInt().urem(VectorWidth) == 0) + return SE->getMinusSCEV(EC, BaseMul); // Start is the whole product. + if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(1))) + if (BaseC->getAPInt().urem(VectorWidth) == 0) + return SE->getMinusSCEV(EC, BaseMul); // Not just its constant factor. } - // Check that this AddRec is associated with this loop. - if (AddExpr->getLoop() != L) { - LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); - return false; - } - auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); - if (!Base || !Base->isZero()) { - LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); - return false; - } - auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1)); - if (!Step) { - LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; - AddExpr->getOperand(1)->dump()); - return false; - } - auto StepValue = Step->getValue()->getSExtValue(); - if (VectorWidth == StepValue) - return true; - - LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue - << " doesn't match vector width " << VectorWidth << "\n"); - return false; + LLVM_DEBUG( + dbgs() << "ARM TP: induction base is not know to be a multiple of VF: " + << *AddExpr->getOperand(0) << "\n"); + return nullptr; } void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { + Value *Start) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); @@ -361,7 +388,7 @@
// Insert a phi to count the number of elements processed by the loop. Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); + Processed->addIncoming(Start, L->getLoopPreheader()); // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and // thus represent the effect of tail predication. @@ -407,12 +434,19 @@ LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { + const SCEV *StartSCEV = IsSafeActiveMask(ActiveLaneMask, TripCount); + if (!StartSCEV) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } - LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount); + LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP. Start is " << *StartSCEV + << "\n"); + SCEVExpander Expander(*SE, L->getHeader()->getModule()->getDataLayout(), + "start"); + Instruction *Ins = L->getLoopPreheader()->getTerminator(); + Value *Start = Expander.expandCodeFor(StartSCEV, StartSCEV->getType(), Ins); + LLVM_DEBUG(dbgs() << "ARM TP: Created start value " << *Start << "\n"); + InsertVCTPIntrinsic(ActiveLaneMask, Start); } // Remove dead instructions and now dead phis. 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -45,8 +45,8 @@ ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r8 -; ENABLED-NEXT: ble .LBB0_2 +; ENABLED-NEXT: cmp r8, r2 +; ENABLED-NEXT: bge .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 @@ -116,8 +116,8 @@ ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r8 -; NOREDUCTIONS-NEXT: ble .LBB0_2 +; NOREDUCTIONS-NEXT: cmp r8, r2 +; NOREDUCTIONS-NEXT: bge .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -118,115 +118,98 @@ ; CHECK-NEXT: .pad #12 ; CHECK-NEXT: sub sp, #12 ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: strd r0, r1, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: blt .LBB4_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #48] -; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: ldr.w r9, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: mov r10, r2 -; CHECK-NEXT: uxth.w r12, r7 -; CHECK-NEXT: adr r7, .LCPI4_0 -; CHECK-NEXT: 
vldrw.u32 q0, [r7] -; CHECK-NEXT: str r2, [sp] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #48] +; CHECK-NEXT: add.w r12, r2, #3 +; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: uxth r3, r1 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB4_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrs r1, r6, #16 -; CHECK-NEXT: subs r3, #1 -; CHECK-NEXT: add.w r9, r9, #2 -; CHECK-NEXT: sub.w r10, r10, #1 -; CHECK-NEXT: strh.w r1, [r7, r11, lsl #1] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r0 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: lsrs r2, r6, #16 +; CHECK-NEXT: sub.w r12, r12, #1 +; CHECK-NEXT: add.w r11, r11, #2 +; CHECK-NEXT: sub.w r8, r8, #1 +; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1] +; CHECK-NEXT: add.w r10, r10, #1 +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: beq .LBB4_12 ; CHECK-NEXT: .LBB4_4: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB4_11 Depth 2 -; CHECK-NEXT: cmp r2, r11 +; CHECK-NEXT: cmp r2, r10 ; CHECK-NEXT: ble .LBB4_2 ; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: sub.w r8, r2, r11 -; CHECK-NEXT: cmp.w r8, #8 +; CHECK-NEXT: sub.w r4, r2, r10 +; CHECK-NEXT: cmp r4, #8 ; CHECK-NEXT: bhs .LBB4_7 ; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB4_10 ; CHECK-NEXT: .LBB4_7: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: bic r7, r10, #7 -; CHECK-NEXT: movs r1, #1 -; CHECK-NEXT: subs r7, #8 +; CHECK-NEXT: bic r2, r8, #7 +; CHECK-NEXT: movs 
r7, #1 +; CHECK-NEXT: subs r2, #8 +; CHECK-NEXT: bic r9, r4, #7 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: add.w lr, r1, r7, lsr #3 -; CHECK-NEXT: bic r7, r8, #7 -; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: add.w lr, r7, r2, lsr #3 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB4_8: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q1, [r4], #16 -; CHECK-NEXT: vldrh.u16 q2, [r5], #16 -; CHECK-NEXT: rsb.w r1, r12, #0 -; CHECK-NEXT: vmullb.s16 q3, q2, q1 -; CHECK-NEXT: vmullt.s16 q1, q2, q1 -; CHECK-NEXT: vshl.s32 q3, r1 -; CHECK-NEXT: vshl.s32 q1, r1 -; CHECK-NEXT: vaddva.u32 r6, q3 -; CHECK-NEXT: vaddva.u32 r6, q1 +; CHECK-NEXT: vldrh.u16 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q1, [r5], #16 +; CHECK-NEXT: rsbs r7, r3, #0 +; CHECK-NEXT: vmullb.s16 q2, q1, q0 +; CHECK-NEXT: vmullt.s16 q0, q1, q0 +; CHECK-NEXT: vshl.s32 q2, r7 +; CHECK-NEXT: vshl.s32 q0, r7 +; CHECK-NEXT: vaddva.u32 r6, q2 +; CHECK-NEXT: vaddva.u32 r6, q0 ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: @ %bb.9: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: cmp r8, r7 +; CHECK-NEXT: cmp r4, r9 ; CHECK-NEXT: beq .LBB4_3 ; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r7, r11 -; CHECK-NEXT: bic lr, r3, #3 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: add.w r4, r1, r7, lsl #1 -; CHECK-NEXT: add.w r5, r1, r5, lsl #1 -; CHECK-NEXT: sub.w r1, lr, r7 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r0, r1, lsr #2 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r2, r9, r10 +; CHECK-NEXT: sub.w r5, r8, r9 +; CHECK-NEXT: add.w r7, r1, r9, lsl #1 +; CHECK-NEXT: add.w r2, r1, r2, lsl #1 +; CHECK-NEXT: 
dlstp.32 lr, r5 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vqadd.u32 q1, q0, r7 -; CHECK-NEXT: vdup.32 q2, r8 -; CHECK-NEXT: rsb.w r1, r12, #0 -; CHECK-NEXT: vptt.u32 hi, q2, q1 -; CHECK-NEXT: vldrht.s32 q1, [r4], #8 -; CHECK-NEXT: vldrht.s32 q2, [r5], #8 -; CHECK-NEXT: adds r7, #4 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vshl.s32 q1, r1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vaddvat.u32 r6, q1 -; CHECK-NEXT: le lr, .LBB4_11 +; CHECK-NEXT: rsbs r4, r3, #0 +; CHECK-NEXT: vldrh.s32 q0, [r7], #8 +; CHECK-NEXT: vldrh.s32 q1, [r2], #8 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshl.s32 q0, r4 +; CHECK-NEXT: vaddva.u32 r6, q0 +; CHECK-NEXT: letp lr, .LBB4_11 ; CHECK-NEXT: b .LBB4_3 ; CHECK-NEXT: .LBB4_12: @ %for.end17 ; CHECK-NEXT: add sp, #12 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.13: -; CHECK-NEXT: .LCPI4_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %conv = sext i16 %Ls to i32 %cmp31 = icmp sgt i16 %Ls, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll --- a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll +++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll @@ -4,44 +4,27 @@ define arm_aapcs_vfpcc void @start12(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: start12: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: blt .LBB0_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: adds 
r4, r3, #3 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: adr r5, .LCPI0_0 -; CHECK-NEXT: sub.w lr, r4, #16 -; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: subs r3, #12 ; CHECK-NEXT: adds r0, #48 ; CHECK-NEXT: adds r1, #48 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 ; CHECK-NEXT: adds r2, #48 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: movs r4, #12 -; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r4 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: vptt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 -; CHECK-NEXT: vfmas.f32 q3, q2, r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q3, [r2], #16 -; CHECK-NEXT: le lr, .LBB0_2 -; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrw.32 q1, [r2], #16 +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup @@ -211,48 +194,30 @@ define arm_aapcs_vfpcc void @startSmod4(i32 %S, ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) { ; CHECK-LABEL: startSmod4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: ldr r6, [sp, #16] -; CHECK-NEXT: cmp r6, #1 -; CHECK-NEXT: blt .LBB3_3 -; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldr.w lr, [sp, #8] +; CHECK-NEXT: cmp.w lr, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt 
{r4, pc} +; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: mvn r4, #12 ; CHECK-NEXT: and.w r4, r4, r0, lsl #2 -; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: add r1, r4 ; CHECK-NEXT: add r2, r4 ; CHECK-NEXT: add r3, r4 -; CHECK-NEXT: adds r4, r6, #3 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: subs r4, r4, r0 -; CHECK-NEXT: vdup.32 q1, r6 -; CHECK-NEXT: subs r4, #4 -; CHECK-NEXT: add.w lr, r5, r4, lsr #2 -; CHECK-NEXT: adr r4, .LCPI3_0 -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: sub.w r0, lr, #4 +; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r0 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vptt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 -; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 -; CHECK-NEXT: vfmas.f32 q3, q2, r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q3, [r3], #16 -; CHECK-NEXT: le lr, .LBB3_2 -; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r6, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: vldrw.u32 q0, [r2], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vfmas.f32 q1, q0, r12 +; CHECK-NEXT: vstrw.32 q1, [r3], #16 +; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} entry: %cmp8 = icmp sgt i32 %n, 0 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup