Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -487,6 +487,10 @@ /// addressing mode expressions. bool shouldFavorPostInc() const; + /// Return true if LSR should make efforts to generate indexed addressing + /// modes that operate across loop iterations. + bool shouldFavorBackedgeIndex(const Loop *L) const; + /// Return true if the target supports masked load/store /// AVX2 and AVX-512 targets allow masks for consecutive load and store bool isLegalMaskedStore(Type *DataType) const; @@ -1058,6 +1060,7 @@ TargetTransformInfo::LSRCost &C2) = 0; virtual bool canMacroFuseCmp() = 0; virtual bool shouldFavorPostInc() const = 0; + virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0; virtual bool isLegalMaskedStore(Type *DataType) = 0; virtual bool isLegalMaskedLoad(Type *DataType) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; @@ -1291,6 +1294,9 @@ bool shouldFavorPostInc() const override { return Impl.shouldFavorPostInc(); } + bool shouldFavorBackedgeIndex(const Loop *L) const override { + return Impl.shouldFavorBackedgeIndex(L); + } bool isLegalMaskedStore(Type *DataType) override { return Impl.isLegalMaskedStore(DataType); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -254,6 +254,8 @@ bool shouldFavorPostInc() const { return false; } + bool shouldFavorBackedgeIndex(const Loop *L) const { return false; } + bool isLegalMaskedStore(Type *DataType) { return false; } bool isLegalMaskedLoad(Type *DataType) { return false; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -163,6 +163,10 @@ return TTIImpl->shouldFavorPostInc(); } +bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const { + return TTIImpl->shouldFavorBackedgeIndex(L); +} + bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const { return TTIImpl->isLegalMaskedStore(DataType); } Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -94,6 +94,12 @@ bool enableInterleavedAccessVectorization() { return true; } + bool shouldFavorBackedgeIndex(const Loop *L) const { + if (L->getHeader()->getParent()->optForSize()) + return false; + return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; + } + /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD /// is IEEE-754 compliant, but it's not covered in this target.
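For context, the ARM override above gates the feature to single-block Thumb-2 M-class loops that are not optimized for size. A minimal C++ illustration of the kind of loop it targets (the function and its names are mine, not part of the patch):

// Illustration only: with backedge indexing, each strided access below can
// become a write-back access such as "ldr r3, [r1, #8]!", instead of all
// accesses sharing one index register advanced by a separate "adds".
void accumulate(const int *a, const int *b, int *out, int n) {
  for (int i = 0; i < n; i += 2) { // single-block loop, not optimized for size
    out[i] = a[i] + b[i];
    out[i + 1] = a[i + 1] + b[i + 1];
  }
}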
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -155,6 +155,14 @@ cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale")); +static cl::opt<bool> CollapseUnrolledCode( + "lsr-collapse-unrolled", cl::Hidden, cl::init(false), + cl::desc("Narrow LSR search space by collapsing unrolled code")); + +static cl::opt<bool> EnableBackedgeIndexing( + "lsr-backedge-indexing", cl::Hidden, cl::init(true), + cl::desc("Enable the generation of cross iteration post increments")); + static cl::opt<unsigned> ComplexityLimit( "lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits<uint16_t>::max()), @@ -1053,12 +1061,12 @@ void dump() const; private: - void RateRegister(const SCEV *Reg, + void RateRegister(const Formula &F, const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, const TargetTransformInfo &TTI); - void RatePrimaryRegister(const SCEV *Reg, + void RatePrimaryRegister(const Formula &F, const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, @@ -1209,7 +1217,7 @@ Instruction *Fixup = nullptr); /// Tally up interesting quantities from the given register. -void Cost::RateRegister(const SCEV *Reg, +void Cost::RateRegister(const Formula &F, const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, @@ -1236,16 +1244,26 @@ } unsigned LoopCost = 1; - if (TTI.shouldFavorPostInc()) { - const SCEV *LoopStep = AR->getStepRecurrence(SE); - if (isa<SCEVConstant>(LoopStep)) { - // Check if a post-indexed load/store can be used. - if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || - TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { + if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || + TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { + + // If the step size matches the base offset, we could use post increment + // addressing so that the instruction then updates the pointer for its + // own use in the next iteration. + if (TTI.shouldFavorBackedgeIndex(L)) { + if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { + if (Step->getAPInt() == F.BaseOffset) + LoopCost = 0; + } + } + + if (TTI.shouldFavorPostInc()) { + const SCEV *LoopStep = AR->getStepRecurrence(SE); + if (isa<SCEVConstant>(LoopStep)) { const SCEV *LoopStart = AR->getStart(); if (!isa<SCEVConstant>(LoopStart) && - SE.isLoopInvariant(LoopStart, L)) - LoopCost = 0; + SE.isLoopInvariant(LoopStart, L)) + LoopCost = 0; } } } @@ -1255,7 +1273,7 @@ // TODO: The non-affine case isn't precisely modeled here. if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { if (!Regs.count(AR->getOperand(1))) { - RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI); + RateRegister(F, AR->getOperand(1), Regs, L, SE, DT, TTI); if (isLoser()) return; } @@ -1279,7 +1297,7 @@ /// Record this register in the set. If we haven't seen it before, rate /// it. Optional LoserRegs provides a way to declare any formula that refers to /// one of those regs an instant loser.
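A condensed C++ sketch of the RateRegister change above (plain integers stand in for the SCEVConstant step and Formula::BaseOffset; the helper is hypothetical, not part of the patch):

#include <cstdint>

// Models the new rule: when post-indexed loads/stores are legal and the
// recurrence step equals the formula's base offset, the access can update
// its own pointer for the next iteration, so the recurring cost is free.
int64_t rateLoopCost(int64_t Step, int64_t BaseOffset, bool FavorBackedgeIndex) {
  int64_t LoopCost = 1;
  if (FavorBackedgeIndex && Step == BaseOffset)
    LoopCost = 0; // e.g. {(G-8),+,8} used at offset +8 -> ldr r0, [rG, #8]!
  return LoopCost;
}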
-void Cost::RatePrimaryRegister(const SCEV *Reg, +void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE, DominatorTree &DT, @@ -1290,7 +1308,7 @@ return; } if (Regs.insert(Reg).second) { - RateRegister(Reg, Regs, L, SE, DT, TTI); + RateRegister(F, Reg, Regs, L, SE, DT, TTI); if (LoserRegs && isLoser()) LoserRegs->insert(Reg); } @@ -1314,7 +1332,7 @@ Lose(); return; } - RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI); + RatePrimaryRegister(F, ScaledReg, Regs, L, SE, DT, LoserRegs, TTI); if (isLoser()) return; } @@ -1323,7 +1341,7 @@ Lose(); return; } - RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI); + RatePrimaryRegister(F, BaseReg, Regs, L, SE, DT, LoserRegs, TTI); if (isLoser()) return; } @@ -1868,6 +1886,18 @@ // Returns the last UserInst in the chain. Instruction *tailUserInst() const { return Incs.back().UserInst; } + Instruction *head() { + return Incs.front().UserInst; + } + + Instruction *lastNonPHI() { + if (!isa<PHINode>(Incs.back().UserInst)) + return tailUserInst(); + if (Incs.size() < 2) + return head(); + return Incs[Incs.size()-2].UserInst; + } + // Returns true if IncExpr can be profitably added to this chain. bool isProfitableIncrement(const SCEV *OperExpr, const SCEV *IncExpr, @@ -1890,6 +1920,7 @@ LoopInfo &LI; const TargetTransformInfo &TTI; Loop *const L; + bool FavorBackedgeIndex = false; bool Changed = false; /// This is the insert position that the current loop's induction variable @@ -2804,7 +2835,7 @@ /// TODO: Consider IVInc free if it's already used in another chains. static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction *> &Users, - ScalarEvolution &SE, const TargetTransformInfo &TTI) { + ScalarEvolution &SE, bool FavorBackedgeIndex) { if (StressIVChain) return true; @@ -2870,6 +2901,9 @@ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost << "\n"); + if (FavorBackedgeIndex) + return cost <= 1; + return cost < 0; } @@ -3064,7 +3098,8 @@ for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) + ChainUsersVec[UsersIdx].FarUsers, SE, + FavorBackedgeIndex)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) @@ -3077,12 +3112,14 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); - LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); - + LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.head() << "\n"); + for (const IVInc &Inc : Chain) { LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n"); auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand); assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand"); + if (FavorBackedgeIndex && UseI->getUser() == Chain.lastNonPHI()) + continue; IVIncSet.insert(UseI); } } @@ -3738,10 +3775,11 @@ void LSRInstance::GenerateConstantOffsetsImpl( LSRUse &LU, unsigned LUIdx, const Formula &Base, const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { - const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; - for (int64_t Offset : Worklist) { + + auto GenerateOffset = [&](const SCEV *G, int64_t Offset) { Formula F = Base; F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; + if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register.
@@ -3761,7 +3799,32 @@ (void)InsertFormula(LU, LUIdx, F); } + }; + + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + + // With constant offsets and constant steps, we can generate post index + // accesses by having the offset equal the step. So, for access #0 with a + // step of 8, we could generate a G - 8 base which would require the first + // access to be ((G - 8) + 8),+,8. The post-indexed access would then update + // the pointer for itself in the next iteration. + if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) { + if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) { + if (auto *StepRec = + dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) { + const APInt &StepInt = StepRec->getAPInt(); + int64_t Step = StepInt.isNegative() ? + StepInt.getSExtValue() : StepInt.getZExtValue(); + + for (int64_t Offset : Worklist) { + Offset -= Step; + GenerateOffset(G, Offset); + } + } + } } + for (int64_t Offset : Worklist) + GenerateOffset(G, Offset); int64_t Imm = ExtractImmediate(G, SE); if (G->isZero() || Imm == 0) @@ -4400,7 +4463,8 @@ /// When there are many registers for expressions like A, A+1, A+2, etc., /// allocate a single register for them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { - if (EstimateSearchSpaceComplexity() < ComplexityLimit) + if (!CollapseUnrolledCode && + EstimateSearchSpaceComplexity() < ComplexityLimit) return; LLVM_DEBUG( @@ -5361,7 +5425,9 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI) - : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) { + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), + FavorBackedgeIndex(EnableBackedgeIndexing && + TTI.shouldFavorBackedgeIndex(L)) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; Index: test/CodeGen/ARM/dsp-loop-indexing.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/dsp-loop-indexing.ll @@ -0,0 +1,324 @@ +; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 -lsr-collapse-unrolled=true %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT + +; CHECK-LABEL: test_qadd_2 +; CHECK: @ %loop +; TODO: pre-inc str + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: add{{.*}}, #8 + +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: str{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #4] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]!
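A worked example of the offset-generation comment above, with illustrative numbers, checking that rebasing the pointer by one step preserves the address sequence:

#include <cassert>

int main() {
  const long G = 1000, Step = 8;  // hypothetical base address and step
  long Base = G - Step;           // rebased start produced by GenerateOffset
  for (int i = 0; i < 4; ++i) {
    Base += Step;                 // models the write-back of "ldr ..., [r, #8]!"
    assert(Base == G + i * Step); // same addresses as the original indexed form
  }
  return 0;
}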
+ +define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_2_backwards +; TODO: Post increments should be generated. + +; CHECK: @ %loop + +; CHECK-DEFAULT: ldr{{.*}}, +; CHECK-DEFAULT: ldr{{.*}}, +; CHECK-DEFAULT: str{{.*}}, +; CHECK-DEFAULT: ldr{{.*}}, #-4] +; CHECK-DEFAULT: ldr{{.*}}, #-4] +; CHECK-DEFAULT: sub{{.*}}, #8 +; CHECK-DEFAULT: str{{.*}}, #-4] +; CHECK-DEFAULT: sub{{.*}}, #8 + +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: str{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: str{{.*}} lsl #2] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = sub nsw nuw i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = sub nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_3 +; CHECK: @ %loop + +; TODO: pre-inc str + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #12]! +; CHECK-DEFAULT: ldr{{.*}}, #12]! +; CHECK-DEFAULT: str{{.*}}, #12] +; CHECK-DEFAULT: add{{.*}}, #12 + +; CHECK-COMPLEX: ldr{{.*}}, #12]! +; CHECK-COMPLEX: ldr{{.*}}, #12]! +; CHECK-COMPLEX: str{{.*}}, #12]! 
+; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: str{{.*}}, #8] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = add nuw nsw i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %idx.3 = add nuw nsw i32 %idx.1, 2 + %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3 + %a.3 = load i32, i32* %gep.a.3 + %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3 + %b.3 = load i32, i32* %gep.b.3 + %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3) + %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3 + store i32 %qadd.3, i32* %addr.3 + %i.next = add nsw nuw i32 %i, -3 + %idx.next = add nsw nuw i32 %idx.1, 3 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_4 +; CHECK: @ %loop + +; TODO: pre-inc store + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: str{{.*}}, #12] +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: str{{.*}}, #16] +; CHECK-DEFAULT: add{{.*}}, #16 + +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #12] +; CHECK-COMPLEX: ldr{{.*}}, #12] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %idx.3 = or i32 %idx.1, 2 + %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3 + %a.3 = load i32, i32* %gep.a.3 + %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3 + %b.3 = load i32, i32* %gep.b.3 + %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3) + %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3 + store i32 %qadd.3, i32* %addr.3 + %idx.4 = or i32 %idx.1, 3 + %gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4 + %a.4 = load i32, i32* %gep.a.4 + %gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4 + %b.4 = load i32, i32* %gep.b.4 + %qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4) + %addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4 + store i32 %qadd.4, i32* %addr.4 + %i.next = add nsw nuw i32 %i, -4 + %idx.next = add nsw nuw i32 %idx.1, 4 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd16_2 +; CHECK: @ %loop +; TODO: pre-inc store. + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: str{{.*}}, #16] +; CHECK-DEFAULT: add{{.*}}, #16 + +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #8] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1 + %cast.a.1 = bitcast i16* %gep.a.1 to i32* + %a.1 = load i32, i32* %cast.a.1 + %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1 + %cast.b.1 = bitcast i16* %gep.b.1 to i32* + %b.1 = load i32, i32* %cast.b.1 + %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = add nsw nuw i32 %idx.1, 2 + %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2 + %cast.a.2 = bitcast i16* %gep.a.2 to i32* + %a.2 = load i32, i32* %cast.a.2 + %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2 + %cast.b.2 = bitcast i16* %gep.b.2 to i32* + %b.2 = load i32, i32* %cast.b.2 + %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 4 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +declare i32 @llvm.arm.qadd(i32, i32) +declare i32 @llvm.arm.qadd16(i32, i32) Index: test/CodeGen/ARM/loop-align-cortex-m.ll =================================================================== --- test/CodeGen/ARM/loop-align-cortex-m.ll +++ test/CodeGen/ARM/loop-align-cortex-m.ll @@ -1,10 +1,10 @@ ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s -; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s define void @test_loop_alignment(i32* %in, i32* %out) optsize { ; CHECK-LABEL: test_loop_alignment: -; CHECK: movs {{r[0-9]+}}, #0 +; CHECK: mov{{.*}}, #0 ; CHECK: .p2align 2 entry: Index: test/CodeGen/ARM/loop-indexing.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/loop-indexing.ll @@ -0,0 +1,1236 @@ +; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BASE --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 -lsr-collapse-unrolled=true %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-COLLAPSE --check-prefix=CHECK-T2 + +; Tests to check that post-increment addressing modes are used instead of +; updating base pointers with add instructions.
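To make the FileCheck patterns concrete: the DISABLED runs expect addresses computed from a shared index, while the CHECK-DEFAULT runs expect each pointer to be advanced by its own access. Roughly, in C++ (my illustration, not derived from the tests):

// Index form (DISABLED): addresses recomputed from one induction variable.
// Pointer form (CHECK-DEFAULT): each access owns a pointer whose update can
// fold into the load/store as a write-back, e.g. "str r2, [r3, #4]!".
void sum_ptrs(const int *a, const int *b, int *out, int n) {
  for (int i = 0; i < n; ++i) {
    *out = *a + *b;
    ++a; ++b; ++out; // write-back candidates
  }
}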
+ +; TODO: I think we should be able to use post inc addressing with VLDM +; instructions. +; CHECK-LABEL: test_fma +; CHECK: @ %loop + +; CHECK-BASE: vldr s{{.*}}, #8] +; CHECK-BASE: vldr s{{.*}}, #8] +; CHECK-BASE: vldr s{{.*}}, #12] +; CHECK-BASE: vldr s{{.*}}, #12] + +; CHECK-COMPLEX: vldr s{{.*}}, #8] +; CHECK-COMPLEX: vldr s{{.*}}, #8] +; CHECK-COMPLEX: vldr s{{.*}}, #12] +; CHECK-COMPLEX: vldr s{{.*}}, #12] + +; CHECK-COLLAPSE: vldr s{{.*}}, #4] +; CHECK-COLLAPSE: vldr s{{.*}}, #4] +; CHECK-COLLAPSE: vldr s{{.*}}, #8] +; CHECK-COLLAPSE: vldr s{{.*}}, #8] +define float @test_fma(float* %a, float* %b, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ] + %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1 + %a.1 = load float, float* %gep.a.1 + %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1 + %b.1 = load float, float* %gep.b.1 + %fmul.1 = fmul float %a.1, %b.1 + %fma.1 = fadd float %fmul.1, %res + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2 + %a.2 = load float, float* %gep.a.2 + %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2 + %b.2 = load float, float* %gep.b.2 + %fmul.2 = fmul float %a.2, %b.2 + %fma.2 = fadd float %fmul.2, %fma.1 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret float %fma.2 +} + +; CHECK-LABEL: convolve_16bit + +; TODO: Generate pre-incs without higher complexity limit +; CHECK-DEFAULT: ldr{{.*}}, #6] +; CHECK-DEFAULT: ldr{{.*}}, #6] +; CHECK-DEFAULT: ldr{{.*}}, #2] +; CHECK-DEFAULT: ldr{{.*}}, #2] + +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, + i32 %filter_dim, i32 %out_width, i32 %out_height, + i32** nocapture readonly %convolved) { +entry: + %cmp92 = icmp eq i32 %out_height, 0 + br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %xtraiter = and i32 %filter_dim, 3 + %unroll_iter = sub i32 %filter_dim, %xtraiter + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph + %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ] + %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093 + %tmp3 = load i32*, i32** %arrayidx22, align 4 + br label %for.cond9.preheader.us.us.preheader + +for.cond9.preheader.us.us.preheader: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph + %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ] + br label %for.cond9.preheader.us.us + +for.cond9.preheader.us.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader + %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ] + %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ] + %add.us.us = add i32 %filter_y.056.us.us, %res_y.093 + %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us + %tmp5 = load i16*, i16** %arrayidx.us.us, align 4 + %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us + %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4 + br label %for.body12.us.us + +for.body12.us.us: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us + %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] + %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ] + %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us + %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us + %tmp9 = load i16, i16* %arrayidx14.us.us, align 2 + %conv.us.us = sext i16 %tmp9 to i32 + %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us + %tmp10 = load i16, i16* %arrayidx16.us.us, align 2 + %conv17.us.us = sext i16 %tmp10 to i32 + %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us + %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us + %inc.us.us = or i32 %filter_x.053.us.us, 1 + %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us + %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us + %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2 + %conv.us.us.1 = sext i16 %tmp11 to i32 + %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1 + %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2 + %conv17.us.us.1 = sext i16 %tmp12 to i32 + %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1 + %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us + %inc.us.us.1 = or i32 %filter_x.053.us.us, 2 + %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us + %arrayidx14.us.us.2 = getelementptr 
inbounds i16, i16* %tmp5, i32 %inc.us.us.1 + %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2 + %conv.us.us.2 = sext i16 %tmp13 to i32 + %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2 + %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2 + %conv17.us.us.2 = sext i16 %tmp14 to i32 + %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2 + %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1 + %inc.us.us.2 = or i32 %filter_x.053.us.us, 3 + %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us + %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2 + %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2 + %conv.us.us.3 = sext i16 %tmp15 to i32 + %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3 + %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2 + %conv17.us.us.3 = sext i16 %tmp16 to i32 + %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3 + %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2 + %inc.us.us.3 = add i32 %filter_x.053.us.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us + +for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us + %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1 + %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim + br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us + +for.cond5.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us + %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us + store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4 + %add25.us = add nuw i32 %res_x.060.us, 1 + %exitcond99 = icmp eq i32 %add25.us, %out_width + br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader + +for.cond.cleanup3: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader + %add28 = add nuw i32 %res_y.093, 1 + %exitcond100 = icmp eq i32 %add28, %out_height + br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void +} + +; CHECK-LABEL: mul_8x8 +; CHECK: @ %for.body +; TODO: pre-inc store. + +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: str{{.*}}, #12] +; CHECK-DEFAULT: ldrb{{.*}}, #4]! +; CHECK-DEFAULT: ldrb{{.*}}, #4]! +; CHECK-DEFAULT: str{{.*}}, #16] +; CHECK-DEFAULT: add{{.*}}, #16 + +; CHECK-COMPLEX: ldrb +; CHECK-COMPLEX: ldrb +; CHECK-COMPLEX: str +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: str{{.*}}, #4]! 
+ +define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil + %tmp2 = load i8, i8* %arrayidx.epil, align 1 + %conv.epil = zext i8 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil + %tmp3 = load i8, i8* %arrayidx1.epil, align 1 + %conv2.epil = zext i8 %tmp3 to i32 + %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010 + %tmp4 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010 + %tmp5 = load i8, i8* %arrayidx1, align 1 + %conv2 = zext i8 %tmp5 to i32 + %mul = mul nuw nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc + %tmp6 = load i8, i8* %arrayidx.1, align 1 + %conv.1 = zext i8 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %tmp7 = load i8, i8* %arrayidx1.1, align 1 + %conv2.1 = zext i8 %tmp7 to i32 + %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1 + %tmp8 = load i8, i8* %arrayidx.2, align 1 + %conv.2 = zext i8 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %tmp9 = load i8, i8* %arrayidx1.2, align 1 + %conv2.2 = zext i8 %tmp9 to i32 + %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2 + %tmp10 = 
load i8, i8* %arrayidx.3, align 1 + %conv.3 = zext i8 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %tmp11 = load i8, i8* %arrayidx1.3, align 1 + %conv2.3 = zext i8 %tmp11 to i32 + %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_16x8 +; CHECK: @ %for.body + +; TODO: pre-inc store +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrsh{{.*}}, #6] +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: str{{.*}}, #12] +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! +; CHECK-DEFAULT: ldrb{{.*}}, #4]! +; CHECK-DEFAULT: str{{.*}}, #16] +; CHECK-DEFAULT: add{{.*}}, #16 + +; CHECK-COMPLEX: ldrsh +; CHECK-COMPLEX: ldrb +; CHECK-COMPLEX: str +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: str{{.*}}, #4]! + +define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil + %tmp2 = load i16, i16* %arrayidx.epil, align 2 + %conv.epil = sext i16 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil + %tmp3 = load i8, i8* %arrayidx1.epil, align 1 + %conv2.epil = zext i8 %tmp3 to i32 + %mul.epil = mul nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; 
preds = %for.body, %for.body.preheader.new + %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %tmp4 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010 + %tmp5 = load i8, i8* %arrayidx1, align 1 + %conv2 = zext i8 %tmp5 to i32 + %mul = mul nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %tmp6 = load i16, i16* %arrayidx.1, align 2 + %conv.1 = sext i16 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %tmp7 = load i8, i8* %arrayidx1.1, align 1 + %conv2.1 = zext i8 %tmp7 to i32 + %mul.1 = mul nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %tmp8 = load i16, i16* %arrayidx.2, align 2 + %conv.2 = sext i16 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %tmp9 = load i8, i8* %arrayidx1.2, align 1 + %conv2.2 = zext i8 %tmp9 to i32 + %mul.2 = mul nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %tmp10 = load i16, i16* %arrayidx.3, align 2 + %conv.3 = sext i16 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %tmp11 = load i8, i8* %arrayidx1.3, align 1 + %conv2.3 = zext i8 %tmp11 to i32 + %mul.3 = mul nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_16x16 +; CHECK: @ %for.body + +; TODO: pre-inc store +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrsh{{.*}}, #6] +; CHECK-DEFAULT: ldrsh{{.*}}, #6] +; CHECK-DEFAULT: str{{.*}}, #12] +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! +; CHECK-DEFAULT: str{{.*}}, #16] +; CHECK-DEFAULT: add{{.*}}, #16 + +; CHECK-COMPLEX: ldrsh +; CHECK-COMPLEX: ldrsh +; CHECK-COMPLEX: str +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: str{{.*}}, #4]! 
+ +define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil + %tmp2 = load i16, i16* %arrayidx.epil, align 2 + %conv.epil = sext i16 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil + %tmp3 = load i16, i16* %arrayidx1.epil, align 2 + %conv2.epil = sext i16 %tmp3 to i32 + %mul.epil = mul nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %tmp4 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %tmp5 = load i16, i16* %arrayidx1, align 2 + %conv2 = sext i16 %tmp5 to i32 + %mul = mul nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %tmp6 = load i16, i16* %arrayidx.1, align 2 + %conv.1 = sext i16 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %tmp7 = load i16, i16* %arrayidx1.1, align 2 + %conv2.1 = sext i16 %tmp7 to i32 + %mul.1 = mul nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %tmp8 = load i16, i16* %arrayidx.2, align 2 + %conv.2 = sext i16 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %tmp9 = load i16, i16* %arrayidx1.2, align 2 + %conv2.2 = sext i16 %tmp9 to i32 + %mul.2 = mul nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* 
%A, i32 %inc.2 + %tmp10 = load i16, i16* %arrayidx.3, align 2 + %conv.3 = sext i16 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %tmp11 = load i16, i16* %arrayidx1.3, align 2 + %conv2.3 = sext i16 %tmp11 to i32 + %mul.3 = mul nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_8x8_2d +; CHECK: @ %for.body4.us + +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: ldrb{{.*}} +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: str{{.*}}, #-12] +; CHECK-DEFAULT: ldrb{{.*}} +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #-8] +; CHECK-DEFAULT: ldrb{{.*}} +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: str{{.*}}, #-4] +; CHECK-DEFAULT: ldrb{{.*}} +; CHECK-DEFAULT: ldrb{{.*}}, #4]! + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: ldr{{.*}}, #4]! + +define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) { +entry: + %cmp24 = icmp eq i32 %N, 0 + %cmp222 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp24, %cmp222 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us + %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us + %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us + %.pre = load i8*, i8** %arrayidx5.us, align 4 + %.pre30 = load i32*, i32** %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %tmp2 = load i8, i8* %arrayidx.us, align 1 + %conv.us = zext i8 %tmp2 to i32 + %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us + %tmp3 = load i8, i8* %arrayidx6.us, align 1 + %conv7.us = zext i8 %tmp3 to i32 + %mul.us = mul nuw nsw i32 %conv7.us, %conv.us + %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us + %tmp4 = load i32, i32* %arrayidx9.us, align 4 + %add.us = add nsw i32 %tmp4, %mul.us + store i32 %add.us, i32* %arrayidx9.us, align 4 + %inc.us = or i32 %j.023.us, 1 + %tmp5 = load i8, i8* %arrayidx.us, align 1 + %conv.us.1 = zext i8 %tmp5 to i32 + %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us + %tmp6 = load i8, i8* %arrayidx6.us.1, align 1 + %conv7.us.1 = zext i8 %tmp6 to i32 + %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1 + %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us + %tmp7 = load i32, i32* 
%arrayidx9.us.1, align 4 + %add.us.1 = add nsw i32 %tmp7, %mul.us.1 + store i32 %add.us.1, i32* %arrayidx9.us.1, align 4 + %inc.us.1 = or i32 %j.023.us, 2 + %tmp8 = load i8, i8* %arrayidx.us, align 1 + %conv.us.2 = zext i8 %tmp8 to i32 + %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1 + %tmp9 = load i8, i8* %arrayidx6.us.2, align 1 + %conv7.us.2 = zext i8 %tmp9 to i32 + %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2 + %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1 + %tmp10 = load i32, i32* %arrayidx9.us.2, align 4 + %add.us.2 = add nsw i32 %tmp10, %mul.us.2 + store i32 %add.us.2, i32* %arrayidx9.us.2, align 4 + %inc.us.2 = or i32 %j.023.us, 3 + %tmp11 = load i8, i8* %arrayidx.us, align 1 + %conv.us.3 = zext i8 %tmp11 to i32 + %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2 + %tmp12 = load i8, i8* %arrayidx6.us.3, align 1 + %conv7.us.3 = zext i8 %tmp12 to i32 + %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3 + %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2 + %tmp13 = load i32, i32* %arrayidx9.us.3, align 4 + %add.us.3 = add nsw i32 %tmp13, %mul.us.3 + store i32 %add.us.3, i32* %arrayidx9.us.3, align 4 + %inc.us.3 = add i32 %j.023.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] + br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %tmp14 = load i8, i8* %arrayidx.us, align 1 + %conv.us.epil = zext i8 %tmp14 to i32 + %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil + %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1 + %conv7.us.epil = zext i8 %tmp15 to i32 + %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil + %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil + %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4 + %add.us.epil = add nsw i32 %tmp16, %mul.us.epil + store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4 + %inc.us.epil = add nuw i32 %j.023.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %inc11.us = add nuw i32 %i.025.us, 1 + %exitcond28 = icmp eq i32 %inc11.us, %N + br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + ret void +} + +; CHECK-LABEL: mul_16x16_2d +; CHECK: @ %for.body4.us + +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrsh{{.*}}, #2]! 
+; CHECK-T2: ldr{{.*}}, #4]! + +define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) { +entry: + %cmp24 = icmp eq i32 %N, 0 + %cmp222 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp24, %cmp222 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us + %tmp2 = load i16, i16* %arrayidx.us, align 2 + %conv.us = sext i16 %tmp2 to i32 + %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us + %tmp3 = load i16*, i16** %arrayidx5.us, align 4 + %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us + %tmp4 = load i32*, i32** %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us + %tmp5 = load i16, i16* %arrayidx6.us, align 2 + %conv7.us = sext i16 %tmp5 to i32 + %mul.us = mul nsw i32 %conv7.us, %conv.us + %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us + %tmp6 = load i32, i32* %arrayidx9.us, align 4 + %add.us = add nsw i32 %tmp6, %mul.us + store i32 %add.us, i32* %arrayidx9.us, align 4 + %inc.us = or i32 %j.023.us, 1 + %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us + %tmp7 = load i16, i16* %arrayidx6.us.1, align 2 + %conv7.us.1 = sext i16 %tmp7 to i32 + %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us + %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us + %tmp8 = load i32, i32* %arrayidx9.us.1, align 4 + %add.us.1 = add nsw i32 %tmp8, %mul.us.1 + store i32 %add.us.1, i32* %arrayidx9.us.1, align 4 + %inc.us.1 = or i32 %j.023.us, 2 + %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1 + %tmp9 = load i16, i16* %arrayidx6.us.2, align 2 + %conv7.us.2 = sext i16 %tmp9 to i32 + %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us + %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1 + %tmp10 = load i32, i32* %arrayidx9.us.2, align 4 + %add.us.2 = add nsw i32 %tmp10, %mul.us.2 + store i32 %add.us.2, i32* %arrayidx9.us.2, align 4 + %inc.us.2 = or i32 %j.023.us, 3 + %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2 + %tmp11 = load i16, i16* %arrayidx6.us.3, align 2 + %conv7.us.3 = sext i16 %tmp11 to i32 + %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us + %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2 + %tmp12 = load i32, i32* %arrayidx9.us.3, align 4 + %add.us.3 = add nsw i32 %tmp12, %mul.us.3 + store i32 %add.us.3, i32* %arrayidx9.us.3, align 4 + %inc.us.3 = add i32 %j.023.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label 
+
+for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
+  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
+  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
+  %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
+  %conv7.us.epil = sext i16 %tmp13 to i32
+  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
+  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
+  %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
+  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
+  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
+  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
+  %epil.iter.sub = add i32 %epil.iter, -1
+  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
+  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %inc11.us = add nuw i32 %i.025.us, 1
+  %exitcond28 = icmp eq i32 %inc11.us, %N
+  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
+  ret void
+}
+
+; CHECK-LABEL: mac_8x8_2d
+; CHECK: @ %for.body4.us
+
+; CHECK-BASE: ldrb{{.*}}
+; CHECK-BASE: ldrb{{.*}}
+; CHECK-BASE: str{{.*}}, lsl #2]
+; CHECK-BASE: ldrb{{.*}}
+; CHECK-BASE: ldrb{{.*}}, #1]
+; CHECK-BASE: str{{.*}}, lsl #2]
+; CHECK-BASE: ldrb{{.*}}
+; CHECK-BASE: ldrb{{.*}}, #2]
+; CHECK-BASE: str{{.*}}, lsl #2]
+; CHECK-BASE: ldrb{{.*}}
+; CHECK-BASE: ldrb{{.*}}, #3]
+; CHECK-BASE: str{{.*}}, lsl #2]
+
+; CHECK-COMPLEX: ldrb{{.*}}
+; CHECK-COMPLEX: ldrb{{.*}}
+; CHECK-COMPLEX: str{{.*}}, lsl #2]
+; CHECK-COMPLEX: ldrb{{.*}}
+; CHECK-COMPLEX: ldrb{{.*}}, #1]
+; CHECK-COMPLEX: str{{.*}}, lsl #2]
+; CHECK-COMPLEX: ldrb{{.*}}
+; CHECK-COMPLEX: ldrb{{.*}}, #2]
+; CHECK-COMPLEX: str{{.*}}, lsl #2]
+; CHECK-COMPLEX: ldrb{{.*}}
+; CHECK-COMPLEX: ldrb{{.*}}, #3]
+; CHECK-COMPLEX: str{{.*}}, lsl #2]
+
+; CHECK-COLLAPSE: ldrb{{.*}},
+; CHECK-COLLAPSE: ldrb{{.*}}, #1]
+; CHECK-COLLAPSE: str{{.*}}, lsl #2]
+; CHECK-COLLAPSE: ldrb{{.*}}, #2]
+; CHECK-COLLAPSE: str{{.*}}, lsl #2]
+; CHECK-COLLAPSE: ldrb{{.*}}, #3]
+; CHECK-COLLAPSE: str{{.*}}, lsl #2]
+; CHECK-COLLAPSE: ldrb{{.*}}, #4]!
+
+; DISABLED-NOT: ldr{{.*}}]!
+; DISABLED-NOT: str{{.*}}]!
+
+; CHECK-T2: @ %for.body4.us.epil
+; CHECK-T2: ldrb{{.*}}, #1]!
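+
+; A plausible C-level equivalent of the kernel below (a sketch for review, not
+; taken from the original sources). Note that the accumulator is stored back
+; to C[i] on every inner iteration, matching the IR:
+;
+;   void mac_8x8_2d(unsigned char *A, unsigned char **B, int *C, int N, int M) {
+;     for (int i = 0; i < N; i++)
+;       for (int j = 0; j < M; j++)
+;         C[i] += (int)A[i] * (int)B[i][j];
+;   }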
+
+define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
+entry:
+  %cmp22 = icmp eq i32 %N, 0
+  %cmp220 = icmp eq i32 %M, 0
+  %or.cond = or i1 %cmp22, %cmp220
+  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader: ; preds = %entry
+  %tmp = add i32 %M, -1
+  %xtraiter = and i32 %M, 3
+  %tmp1 = icmp ult i32 %tmp, 3
+  %unroll_iter = sub i32 %M, %xtraiter
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
+  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
+  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
+  %.pre = load i8*, i8** %arrayidx5.us, align 4
+  %.pre28 = load i32, i32* %arrayidx8.us, align 4
+  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
+
+for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
+  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
+  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
+  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
+  %tmp3 = load i8, i8* %arrayidx.us, align 1
+  %conv.us = zext i8 %tmp3 to i32
+  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
+  %tmp4 = load i8, i8* %arrayidx6.us, align 1
+  %conv7.us = zext i8 %tmp4 to i32
+  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
+  %add.us = add nsw i32 %mul.us, %tmp2
+  store i32 %add.us, i32* %arrayidx8.us, align 4
+  %inc.us = or i32 %j.021.us, 1
+  %tmp5 = load i8, i8* %arrayidx.us, align 1
+  %conv.us.1 = zext i8 %tmp5 to i32
+  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
+  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
+  %conv7.us.1 = zext i8 %tmp6 to i32
+  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
+  %add.us.1 = add nsw i32 %mul.us.1, %add.us
+  store i32 %add.us.1, i32* %arrayidx8.us, align 4
+  %inc.us.1 = or i32 %j.021.us, 2
+  %tmp7 = load i8, i8* %arrayidx.us, align 1
+  %conv.us.2 = zext i8 %tmp7 to i32
+  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
+  %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
+  %conv7.us.2 = zext i8 %tmp8 to i32
+  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
+  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
+  store i32 %add.us.2, i32* %arrayidx8.us, align 4
+  %inc.us.2 = or i32 %j.021.us, 3
+  %tmp9 = load i8, i8* %arrayidx.us, align 1
+  %conv.us.3 = zext i8 %tmp9 to i32
+  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
+  %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
+  %conv7.us.3 = zext i8 %tmp10 to i32
+  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
+  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
+  store i32 %add.us.3, i32* %arrayidx8.us, align 4
+  %inc.us.3 = add i32 %j.021.us, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
+  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
+  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
+  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %tmp12 = load i8, i8* %arrayidx.us, align 1
+  %conv.us.epil = zext i8 %tmp12 to i32
+  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
+  %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
+  %conv7.us.epil = zext i8 %tmp13 to i32
+  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
+  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
+  store i32 %add.us.epil, i32* %arrayidx8.us, align 4
+  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
+  %epil.iter.sub = add i32 %epil.iter, -1
+  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
+  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %inc10.us = add nuw i32 %i.023.us, 1
+  %exitcond26 = icmp eq i32 %inc10.us, %N
+  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
+  ret void
+}
+
+; CHECK-LABEL: mac_16x16_2d
+; CHECK: @ %for.body4.us
+
+; CHECK-BASE: ldrsh{{.*}}, lsl #1]
+; CHECK-BASE: ldrsh{{.*}}, #2]
+; CHECK-BASE: ldrsh{{.*}}, #4]
+; CHECK-BASE: ldrsh{{.*}}, #6]
+
+; CHECK-COMPLEX: ldrsh{{.*}}, lsl #1]
+; CHECK-COMPLEX: ldrsh{{.*}}, #2]
+; CHECK-COMPLEX: ldrsh{{.*}}, #4]
+; CHECK-COMPLEX: ldrsh{{.*}}, #6]
+
+; CHECK-COLLAPSE: ldrsh{{.*}}, #8]!
+
+; DISABLED-NOT: ldr{{.*}}]!
+
+; CHECK-T2: @ %for.body4.us.epil
+; CHECK-T2: ldrsh{{.*}}, #2]!
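+
+; A plausible C-level equivalent of the kernel below (a sketch for review, not
+; taken from the original sources). Unlike mac_8x8_2d, here the C[i]
+; accumulator has been promoted to a register and is only stored back after
+; the inner loop:
+;
+;   void mac_16x16_2d(short *A, short **B, int *C, int N, int M) {
+;     for (int i = 0; i < N; i++)
+;       for (int j = 0; j < M; j++)
+;         C[i] += (int)A[i] * (int)B[i][j];
+;   }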
+
+define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
+entry:
+  %cmp23 = icmp eq i32 %N, 0
+  %cmp220 = icmp eq i32 %M, 0
+  %or.cond = or i1 %cmp23, %cmp220
+  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader: ; preds = %entry
+  %tmp = add i32 %M, -1
+  %xtraiter = and i32 %M, 3
+  %tmp1 = icmp ult i32 %tmp, 3
+  %unroll_iter = sub i32 %M, %xtraiter
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
+  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
+  %tmp2 = load i16, i16* %arrayidx.us, align 2
+  %conv.us = sext i16 %tmp2 to i32
+  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
+  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
+  %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
+  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
+
+for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
+  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
+  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
+  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
+  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
+  %tmp4 = load i16, i16* %arrayidx6.us, align 2
+  %conv7.us = sext i16 %tmp4 to i32
+  %mul.us = mul nsw i32 %conv7.us, %conv.us
+  %add.us = add nsw i32 %mul.us, %add22.us
+  %inc.us = or i32 %j.021.us, 1
+  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
+  %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
+  %conv7.us.1 = sext i16 %tmp5 to i32
+  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
+  %add.us.1 = add nsw i32 %mul.us.1, %add.us
+  %inc.us.1 = or i32 %j.021.us, 2
+  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
+  %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
+  %conv7.us.2 = sext i16 %tmp6 to i32
+  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
+  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
+  %inc.us.2 = or i32 %j.021.us, 3
+  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
+  %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
+  %conv7.us.3 = sext i16 %tmp7 to i32
+  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
+  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
+  %inc.us.3 = add i32 %j.021.us, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
+  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
+  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
+  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
+  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
+  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
+  %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
+  %conv7.us.epil = sext i16 %tmp8 to i32
+  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
+  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
+  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
+  %epil.iter.sub = add i32 %epil.iter, -1
+  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
+  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
+
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
+  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
+  store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
+  %inc10.us = add nuw i32 %i.024.us, 1
+  %exitcond27 = icmp eq i32 %inc10.us, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
+  ret void
+}
+
+; CHECK-LABEL: mul32x32_backwards
+; CHECK: @ %for.body
+
+; TODO: post increments for decreasing addresses
+; CHECK-DEFAULT-NOT: ldr{{.*}}]!
+; CHECK-DEFAULT-NOT: str{{.*}}]!
+
+; CHECK-COMPLEX-NOT: ldr{{.*}}]!
+; CHECK-COMPLEX-NOT: str{{.*}}]!
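+
+; A plausible C-level equivalent (a sketch for review, not taken from the
+; original sources): the same elementwise multiply as mul32x32_forwards, but
+; iterating from N-1 down to 0, which is why no post-increment addressing is
+; expected here yet:
+;
+;   void mul32x32_backwards(int *a, int *b, int *c, int N) {
+;     for (int i = N - 1; i >= 0; i--)
+;       a[i] = b[i] * c[i];
+;   }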
+
+define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+  %i.08 = add i32 %N, -1
+  %cmp9 = icmp sgt i32 %i.08, -1
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+  %xtraiter = and i32 %N, 3
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
+
+for.body.prol: ; preds = %for.body.prol, %for.body.preheader
+  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
+  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
+  %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
+  %tmp = load i32, i32* %arrayidx.prol, align 4
+  %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
+  %tmp1 = load i32, i32* %arrayidx1.prol, align 4
+  %mul.prol = mul nsw i32 %tmp1, %tmp
+  %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
+  store i32 %mul.prol, i32* %arrayidx2.prol, align 4
+  %i.0.prol = add i32 %i.010.prol, -1
+  %prol.iter.sub = add i32 %prol.iter, -1
+  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
+
+for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader
+  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
+  %tmp2 = icmp ult i32 %i.08, 3
+  br i1 %tmp2, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry
+  ret void
+
+for.body: ; preds = %for.body, %for.body.prol.loopexit
+  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
+  %tmp3 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
+  %tmp4 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %tmp4, %tmp3
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
+  store i32 %mul, i32* %arrayidx2, align 4
+  %i.0 = add i32 %i.010, -1
+  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
+  %tmp5 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
+  %tmp6 = load i32, i32* %arrayidx1.1, align 4
+  %mul.1 = mul nsw i32 %tmp6, %tmp5
+  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
+  store i32 %mul.1, i32* %arrayidx2.1, align 4
+  %i.0.1 = add i32 %i.010, -2
+  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
+  %tmp7 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
+  %tmp8 = load i32, i32* %arrayidx1.2, align 4
+  %mul.2 = mul nsw i32 %tmp8, %tmp7
+  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
+  store i32 %mul.2, i32* %arrayidx2.2, align 4
+  %i.0.2 = add i32 %i.010, -3
+  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
+  %tmp9 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
+  %tmp10 = load i32, i32* %arrayidx1.3, align 4
+  %mul.3 = mul nsw i32 %tmp10, %tmp9
+  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
+  store i32 %mul.3, i32* %arrayidx2.3, align 4
+  %i.0.3 = add i32 %i.010, -4
+  %cmp.3 = icmp sgt i32 %i.0.3, -1
+  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul32x32_forwards
+; CHECK: @ %for.body
+
+; TODO: pre-inc store
+; CHECK-DEFAULT: ldr{{.*}}, #4]
+; CHECK-DEFAULT: ldr{{.*}}, #4]
+; CHECK-DEFAULT: str{{.*}}, #4]
+; CHECK-DEFAULT: ldr{{.*}}, #8]
+; CHECK-DEFAULT: ldr{{.*}}, #8]
+; CHECK-DEFAULT: str{{.*}}, #8]
+; CHECK-DEFAULT: ldr{{.*}}, #12]
+; CHECK-DEFAULT: ldr{{.*}}, #12]
+; CHECK-DEFAULT: str{{.*}}, #12]
+; CHECK-DEFAULT: ldr{{.*}}, #16]!
+; CHECK-DEFAULT: ldr{{.*}}, #16]!
+; CHECK-DEFAULT: str{{.*}}, #16]
+; CHECK-DEFAULT: add{{.*}}, #16
+
+; TODO: Higher complexity results in 22 instructions vs 20.
+; CHECK-COMPLEX-NOT: ldr{{.*}}, #16]!
+; CHECK-COMPLEX-NOT: str{{.*}}, #16]!
+
+; CHECK-T2: @ %for.body.epil
+; CHECK-T2: ldr{{.*}}, #4]!
+; CHECK-T2: ldr{{.*}}, #4]!
+; CHECK-T2: str{{.*}}, #4]!
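+
+; A plausible C-level equivalent (a sketch for review, not taken from the
+; original sources): an elementwise multiply over increasing addresses:
+;
+;   void mul32x32_forwards(int *a, int *b, int *c, int N) {
+;     for (int i = 0; i < N; i++)
+;       a[i] = b[i] * c[i];
+;   }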
+
+define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  %tmp = add i32 %N, -1
+  %xtraiter = and i32 %N, 3
+  %tmp1 = icmp ult i32 %tmp, 3
+  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new: ; preds = %for.body.preheader
+  %unroll_iter = sub i32 %N, %xtraiter
+  br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
+  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
+  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
+  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
+  %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
+  %tmp2 = load i32, i32* %arrayidx.epil, align 4
+  %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
+  %tmp3 = load i32, i32* %arrayidx1.epil, align 4
+  %mul.epil = mul nsw i32 %tmp3, %tmp2
+  %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
+  store i32 %mul.epil, i32* %arrayidx2.epil, align 4
+  %inc.epil = add nuw nsw i32 %i.09.epil, 1
+  %epil.iter.sub = add i32 %epil.iter, -1
+  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
+  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
+
+for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
+  ret void
+
+for.body: ; preds = %for.body, %for.body.preheader.new
+  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
+  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
+  %tmp5 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %tmp5, %tmp4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = or i32 %i.09, 1
+  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
+  %tmp6 = load i32, i32* %arrayidx.1, align 4
+  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
+  %tmp7 = load i32, i32* %arrayidx1.1, align 4
+  %mul.1 = mul nsw i32 %tmp7, %tmp6
+  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
+  store i32 %mul.1, i32* %arrayidx2.1, align 4
+  %inc.1 = or i32 %i.09, 2
+  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
+  %tmp8 = load i32, i32* %arrayidx.2, align 4
+  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
+  %tmp9 = load i32, i32* %arrayidx1.2, align 4
+  %mul.2 = mul nsw i32 %tmp9, %tmp8
+  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
+  store i32 %mul.2, i32* %arrayidx2.2, align 4
+  %inc.2 = or i32 %i.09, 3
+  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
+  %tmp10 = load i32, i32* %arrayidx.3, align 4
+  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
+  %tmp11 = load i32, i32* %arrayidx1.3, align 4
+  %mul.3 = mul nsw i32 %tmp11, %tmp10
+  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
+  store i32 %mul.3, i32* %arrayidx2.3, align 4
+  %inc.3 = add nuw nsw i32 %i.09, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}
Index: test/Transforms/LoopStrengthReduce/ARM/complexity.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/ARM/complexity.ll
+++ test/Transforms/LoopStrengthReduce/ARM/complexity.ll
@@ -1,21 +1,25 @@
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
-; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX
+; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s
+; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s

-; CHECK-DEFAULT-LABEL: for.body12.us.us:
-; CHECK-DEFAULT: phi i32
-; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
-; CHECK-DEFAULT: phi i32
-; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8
+; CHECK-LABEL: for.cond9.preheader.us.us:
+; CHECK: [[SCEVGEP:%[^ ]+]] = getelementptr i16, i16* %tmp5, i32 -4
+; CHECK: [[SCEVGEP9:%[^ ]+]] = getelementptr i16, i16* %tmp6, i32 %lsr.iv

-; CHECK-COMPLEX-LABEL: for.body12.us.us:
-; CHECK-COMPLEX: phi i32
-; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ]
-; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ]
-; CHECK-COMPLEX: phi i32
-; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
-; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4
+; CHECK-LABEL: for.body12.us.us:
+; CHECK: [[LSR_IV10:%[^ ]+]] = phi i16* [ [[SCEVGEP11:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP9]], %for.cond9.preheader.us.us ]
+; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP]], %for.cond9.preheader.us.us ]
+; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 4
+; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 5
+; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 5
+; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 6
+; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 6
+; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 7
+; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 7
+; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4
+; CHECK: [[SCEVGEP11]] = getelementptr i16, i16* [[LSR_IV10]], i32 4

 define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) {
 entry: