Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -638,13 +638,14 @@
                   DominatorTree *DT, AssumptionCache *AC,
                   TargetLibraryInfo *LibInfo) const;
 
-  /// \return True is LSR should make efforts to create/preserve post-inc
-  /// addressing mode expressions.
-  bool shouldFavorPostInc() const;
+  enum AddressingModeKind {
+    AMK_PreIndexed,
+    AMK_PostIndexed,
+    AMK_None
+  };
 
-  /// Return true if LSR should make efforts to generate indexed addressing
-  /// modes that operate across loop iterations.
-  bool shouldFavorBackedgeIndex(const Loop *L) const;
+  /// Return the preferred addressing mode LSR should make efforts to generate.
+  AddressingModeKind getAddressingMode(const Loop *L) const;
 
   /// Return true if the target supports masked store.
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
@@ -1459,8 +1460,7 @@
   virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                           LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
                           TargetLibraryInfo *LibInfo) = 0;
-  virtual bool shouldFavorPostInc() const = 0;
-  virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
+  virtual AddressingModeKind getAddressingMode(const Loop *L) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
@@ -1803,9 +1803,8 @@
                   TargetLibraryInfo *LibInfo) override {
     return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
   }
-  bool shouldFavorPostInc() const override { return Impl.shouldFavorPostInc(); }
-  bool shouldFavorBackedgeIndex(const Loop *L) const override {
-    return Impl.shouldFavorBackedgeIndex(L);
+  AddressingModeKind getAddressingMode(const Loop *L) const override {
+    return Impl.getAddressingMode(L);
   }
   bool isLegalMaskedStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedStore(DataType, Alignment);
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -209,9 +209,9 @@
     return false;
   }
 
-  bool shouldFavorPostInc() const { return false; }
-
-  bool shouldFavorBackedgeIndex(const Loop *L) const { return false; }
+  TTI::AddressingModeKind getAddressingMode(const Loop *L) const {
+    return TTI::AMK_None;
+  }
 
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const {
     return false;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -410,12 +410,9 @@
   return TTIImpl->canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
 }
 
-bool TargetTransformInfo::shouldFavorPostInc() const {
-  return TTIImpl->shouldFavorPostInc();
-}
-
-bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const {
-  return TTIImpl->shouldFavorBackedgeIndex(L);
+TTI::AddressingModeKind
+TargetTransformInfo::getAddressingMode(const Loop *L) const {
+  return TTIImpl->getAddressingMode(L);
 }
 
 bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -103,8 +103,7 @@
 
   bool enableInterleavedAccessVectorization() { return true; }
 
-  bool shouldFavorBackedgeIndex(const Loop *L) const;
-  bool shouldFavorPostInc() const;
+  TTI::AddressingModeKind getAddressingMode(const Loop *L) const;
 
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -100,18 +100,18 @@
   return MatchExact && MatchSubset;
 }
 
-bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
+TTI::AddressingModeKind ARMTTIImpl::getAddressingMode(const Loop *L) const {
   if (L->getHeader()->getParent()->hasOptSize())
-    return false;
-  if (ST->hasMVEIntegerOps())
-    return false;
-  return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
-}
+    return TTI::AMK_None;
 
-bool ARMTTIImpl::shouldFavorPostInc() const {
-  if (ST->hasMVEIntegerOps())
-    return true;
-  return false;
+  if (!ST->hasMVEIntegerOps() && ST->isMClass() && ST->isThumb2() &&
+      L->getNumBlocks() == 1)
+    return TTI::AMK_PreIndexed;
+
+  if (ST->hasMVEIntegerOps() || ST->hasMVEFloatOps())
+    return TTI::AMK_PostIndexed;
+
+  return TTI::AMK_None;
 }
 
 Optional<Instruction *>
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -67,7 +67,7 @@
                              TTI::PeelingPreferences &PP);
 
   /// Bias LSR towards creating post-increment opportunities.
-  bool shouldFavorPostInc() const;
+  TTI::AddressingModeKind getAddressingMode(const Loop *L) const;
 
   // L1 cache prefetch.
   unsigned getPrefetchDistance() const override;
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -80,8 +80,8 @@
   }
 }
 
-bool HexagonTTIImpl::shouldFavorPostInc() const {
-  return true;
+TTI::AddressingModeKind HexagonTTIImpl::getAddressingMode(const Loop *L) const {
+  return TTI::AMK_PostIndexed;
 }
 
 /// --- Vector TTI begin ---
Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1227,13 +1227,15 @@
 /// Tally up interesting quantities from the given register.
 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                         SmallPtrSetImpl<const SCEV *> &Regs) {
+  TTI::AddressingModeKind AMK = TTI->getAddressingMode(L);
+
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, it should be an invariant
     // with respect to L since L is the innermost loop (at least
     // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider it's register free and leave it alone.
-      if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
+      if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
         return;
 
       // It is bad to allow LSR for current loop to add induction variables
@@ -1254,13 +1256,13 @@
 
       // If the step size matches the base offset, we could use pre-indexed
      // addressing.
-      if (TTI->shouldFavorBackedgeIndex(L)) {
+      if (AMK == TTI::AMK_PreIndexed) {
         if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
           if (Step->getAPInt() == F.BaseOffset)
             LoopCost = 0;
       }
 
-      if (TTI->shouldFavorPostInc()) {
+      if (AMK == TTI::AMK_PostIndexed) {
         const SCEV *LoopStep = AR->getStepRecurrence(*SE);
         if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
@@ -3575,7 +3577,8 @@
   // may generate a post-increment operator. The reason is that the
   // reassociations cause extra base+register formula to be created,
   // and possibly chosen, but the post-increment is more efficient.
-  if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+  TTI::AddressingModeKind AMK = TTI.getAddressingMode(L);
+  if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
     return;
   SmallVector<const SCEV *, 8> AddOps;
   const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
@@ -4239,7 +4242,7 @@
         NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
         if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                         NewF)) {
-          if (TTI.shouldFavorPostInc() &&
+          if (TTI.getAddressingMode(this->L) == TTI::AMK_PostIndexed &&
               mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
             continue;
           if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
@@ -4679,7 +4682,7 @@
 /// If we are over the complexity limit, filter out any post-inc prefering
 /// variables to only post-inc values.
 void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
-  if (!TTI.shouldFavorPostInc())
+  if (TTI.getAddressingMode(L) != TTI::AMK_PostIndexed)
     return;
   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
     return;
@@ -4978,7 +4981,8 @@
   // This can sometimes (notably when trying to favour postinc) lead to
   // sub-optimial decisions. There it is best left to the cost modelling to
   // get correct.
-  if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+  if (TTI.getAddressingMode(L) != TTI::AMK_PostIndexed ||
+      LU.Kind != LSRUse::Address) {
     int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
     for (const SCEV *Reg : ReqRegs) {
       if ((F.ScaledReg && F.ScaledReg == Reg) ||
@@ -5560,7 +5564,7 @@
                          TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
     : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
       MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
-                                       TTI.shouldFavorBackedgeIndex(L)) {
+                                       TTI.getAddressingMode(L) == TTI::AMK_PreIndexed) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -388,82 +388,83 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #20
-; CHECK-NEXT:    sub sp, #20
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    cmp r3, #4
-; CHECK-NEXT:    strd r0, r1, [sp, #12] @ 8-byte Folded Spill
+; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    strd r0, r1, [sp] @ 8-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB3_8
 ; CHECK-NEXT:  @ %bb.1: @ %entry
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB3_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
-; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr r4, [sp, #56]
-; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
+; CHECK-NEXT:    ldr.w r9, [sp, #60]
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    ldr r7, [sp, #52]
+; CHECK-NEXT:    add.w r0, r9, #7
+; CHECK-NEXT:    add.w r11, r9, r9, lsl #1
+; CHECK-NEXT:    lsl.w r3, r9, #1
+; CHECK-NEXT:    lsrs r0, r0, #3
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r1, r3
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r3, #7
-; CHECK-NEXT:    lsr.w r11, r0, #3
 ; CHECK-NEXT:  .LBB3_3: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_5 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #68]
-; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
-; CHECK-NEXT:    subs.w r0, r11, r11
+; CHECK-NEXT:    ldr r0, [sp, #64]
+; CHECK-NEXT:    ldr.w r8, [r0, r10, lsl #2]
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    subs r0, r0, r0
 ; CHECK-NEXT:    ble .LBB3_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov r6, r10
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mla r5, r10, r9, r1
 ; CHECK-NEXT:    dls lr, r0
-; CHECK-NEXT:    ldrd r5, r0, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r8, r10
-; CHECK-NEXT:    mla r7, r9, r3, r1
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    mov r6, r8
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov r4, r8
+; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:  .LBB3_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB3_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vldrb.s16 q0, [r0], #8
+; CHECK-NEXT:    add.w r2, r1, r11
+; CHECK-NEXT:    vadd.i16 q1, q0, r7
 ; CHECK-NEXT:    vldrb.s16 q0, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r4
-; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r6, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r4, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r2]
+; CHECK-NEXT:    adds r2, r1, r3
+; CHECK-NEXT:    add r1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vldrb.s16 q1, [r2]
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
+; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r1]
+; CHECK-NEXT:    mov r1, r0
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:    b .LBB3_7
 ; CHECK-NEXT:  .LBB3_6: @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    mov r12, r10
-; CHECK-NEXT:    mov r8, r10
-; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    mov r4, r8
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov r6, r8
 ; CHECK-NEXT:  .LBB3_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #72]
-; CHECK-NEXT:    add.w r0, r8, r12
+; CHECK-NEXT:    add.w r0, r12, r4
+; CHECK-NEXT:    ldr r1, [sp, #68]
 ; CHECK-NEXT:    add r0, r6
-; CHECK-NEXT:    add r0, r10
-; CHECK-NEXT:    strb.w r0, [r1, r9]
-; CHECK-NEXT:    add.w r9, r9, #1
-; CHECK-NEXT:    cmp r9, r2
+; CHECK-NEXT:    add r0, r8
+; CHECK-NEXT:    strb.w r0, [r1, r10]
+; CHECK-NEXT:    add.w r10, r10, #1
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    cmp r10, r0
 ; CHECK-NEXT:    bne .LBB3_3
 ; CHECK-NEXT:  .LBB3_8: @ %if.end
-; CHECK-NEXT:    ldr r0, [sp, #72]
-; CHECK-NEXT:    add sp, #20
+; CHECK-NEXT:    ldr r0, [sp, #68]
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp = icmp eq i16 %num_cols, 4
@@ -870,87 +871,83 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #28
-; CHECK-NEXT:    sub sp, #28
-; CHECK-NEXT:    add.w r12, sp, #12
+; CHECK-NEXT:    .pad #20
+; CHECK-NEXT:    sub sp, #20
 ; CHECK-NEXT:    cmp r3, #4
-; CHECK-NEXT:    stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill
+; CHECK-NEXT:    stm.w sp, {r0, r1, r2} @ 12-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %entry
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
-; CHECK-NEXT:    ldr r2, [sp, #92]
-; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr r4, [sp, #76]
-; CHECK-NEXT:    add.w r0, r1, r2, lsl #1
-; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r1, r2
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r2, r2, lsl #1
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r2, #7
-; CHECK-NEXT:    lsrs r1, r0, #3
+; CHECK-NEXT:    ldr.w r10, [sp, #84]
+; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    ldr r7, [sp, #68]
+; CHECK-NEXT:    add.w r0, r10, #7
+; CHECK-NEXT:    add.w r11, r10, r10, lsl #1
+; CHECK-NEXT:    lsl.w r8, r10, #1
+; CHECK-NEXT:    lsrs r2, r0, #3
 ; CHECK-NEXT:  .LBB6_3: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB6_5 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #96]
-; CHECK-NEXT:    cmp r1, r1
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
+; CHECK-NEXT:    ldr r0, [sp, #88]
+; CHECK-NEXT:    cmp r2, r2
+; CHECK-NEXT:    strd r6, r2, [sp, #12] @ 8-byte Folded Spill
+; CHECK-NEXT:    ldr.w r0, [r0, r6, lsl #2]
 ; CHECK-NEXT:    bge .LBB6_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    ldr r2, [sp, #92]
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r10
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r12, r10
-; CHECK-NEXT:    mla r3, r9, r2, r0
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r8, r10
-; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    mla r5, r6, r10, r1
+; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r2, r1
+; CHECK-NEXT:    dlstp.16 lr, r10
 ; CHECK-NEXT:  .LBB6_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB6_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r4
-; CHECK-NEXT:    vldrb.s16 q0, [r3], #8
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vldrb.s16 q0, [r2], #8
+; CHECK-NEXT:    vadd.i16 q1, q0, r7
+; CHECK-NEXT:    vldrb.s16 q0, [r5], #8
+; CHECK-NEXT:    vmlava.s16 r4, q0, q1
+; CHECK-NEXT:    add.w r3, r1, r11
+; CHECK-NEXT:    vldrb.s16 q1, [r3]
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
+; CHECK-NEXT:    vmlava.s16 r0, q0, q1
+; CHECK-NEXT:    add.w r3, r1, r8
+; CHECK-NEXT:    vldrb.s16 q1, [r3]
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    add r1, r10
+; CHECK-NEXT:    vldrb.s16 q1, [r1]
+; CHECK-NEXT:    vadd.i16 q1, q1, r7
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    letp lr, .LBB6_5
 ; CHECK-NEXT:    b .LBB6_7
 ; CHECK-NEXT:  .LBB6_6: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    mov r8, r10
-; CHECK-NEXT:    mov r12, r10
-; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    add.w r0, r12, r8
-; CHECK-NEXT:    ldr r1, [sp, #100]
-; CHECK-NEXT:    add r0, r6
-; CHECK-NEXT:    add r0, r10
-; CHECK-NEXT:    strb.w r0, [r1, r9]
-; CHECK-NEXT:    add.w r9, r9, #1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    cmp r9, r0
+; CHECK-NEXT:    add.w r1, r12, r4
+; CHECK-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    add r1, r6
+; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:    ldr r1, [sp, #92]
+; CHECK-NEXT:    strb r0, [r1, r6]
+; CHECK-NEXT:    adds r6, #1
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    cmp r6, r0
 ; CHECK-NEXT:    bne .LBB6_3
 ; CHECK-NEXT:  .LBB6_8: @ %if.end
-; CHECK-NEXT:    ldr r0, [sp, #100]
-; CHECK-NEXT:    add sp, #28
+; CHECK-NEXT:    ldr r0, [sp, #92]
+; CHECK-NEXT:    add sp, #20
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %cmp = icmp eq i16 %num_cols, 4