Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -487,6 +487,8 @@
   /// addressing mode expressions.
   bool shouldFavorPostInc() const;
 
+  bool shouldFavorCrossIterationPostInc() const;
+
   /// Return true if the target supports masked load/store
   /// AVX2 and AVX-512 targets allow masks for consecutive load and store
   bool isLegalMaskedStore(Type *DataType) const;
@@ -1054,6 +1056,7 @@
                                      TargetTransformInfo::LSRCost &C2) = 0;
   virtual bool canMacroFuseCmp() = 0;
   virtual bool shouldFavorPostInc() const = 0;
+  virtual bool shouldFavorCrossIterationPostInc() const = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -1287,6 +1290,9 @@
   bool shouldFavorPostInc() const override {
     return Impl.shouldFavorPostInc();
   }
+  bool shouldFavorCrossIterationPostInc() const override {
+    return Impl.shouldFavorCrossIterationPostInc();
+  }
   bool isLegalMaskedStore(Type *DataType) override {
     return Impl.isLegalMaskedStore(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,8 @@
 
   bool shouldFavorPostInc() const { return false; }
 
+  bool shouldFavorCrossIterationPostInc() const { return false; }
+
   bool isLegalMaskedStore(Type *DataType) { return false; }
 
   bool isLegalMaskedLoad(Type *DataType) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -163,6 +163,10 @@
   return TTIImpl->shouldFavorPostInc();
 }
 
+bool TargetTransformInfo::shouldFavorCrossIterationPostInc() const {
+  return TTIImpl->shouldFavorCrossIterationPostInc();
+}
+
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedStore(DataType);
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -47,6 +47,7 @@
 
   const ARMSubtarget *ST;
   const ARMTargetLowering *TLI;
+  const Function &F;
 
   // Currently the following features are excluded from InlineFeatureWhitelist.
   // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16
@@ -87,13 +88,18 @@
 public:
   explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
-        TLI(ST->getTargetLowering()) {}
+        TLI(ST->getTargetLowering()), F(F) {}
 
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool shouldFavorCrossIterationPostInc() const {
+    return !F.optForMinSize() &&
+           ST->isMClass() && ST->isThumb2() && !ST->hasBranchPredictor();
+  }
+
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
   /// is IEEE-754 compliant, but it's not covered in this target.
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1236,16 +1236,44 @@
     }
 
     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+        TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+
+      std::function<const SCEVConstant *(const SCEV *)> GetConstantStart =
+          [&GetConstantStart](const SCEV *S) -> const SCEVConstant * {
+        if (auto *C = dyn_cast<SCEVConstant>(S))
+          return C;
+
+        if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(S))
+          return GetConstantStart(AddRec->getStart());
+
+        if (auto *Add = dyn_cast<SCEVAddExpr>(S))
+          return GetConstantStart(Add->getOperand(0));
+
+        return nullptr;
+      };
+
+      if (TTI.shouldFavorCrossIterationPostInc()) {
+        if (isa<SCEVConstant>(AR->getOperand(1))) {
+          if (auto *Start = GetConstantStart(AR)) {
+            const APInt &StartInt = Start->getAPInt();
+            const APInt &ARInt = cast<SCEVConstant>(AR->getOperand(1))->getAPInt();
+            // We can turn this access into a post increment as the initial offset
+            // required matches the recurrence.
+            if ((StartInt.isNegative() && StartInt.abs() == ARInt) ||
+                (ARInt.isNegative() && ARInt.abs() == StartInt))
+              LoopCost = 0;
+          }
+        }
+      }
+
+      if (TTI.shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-                SE.isLoopInvariant(LoopStart, L))
-              LoopCost = 0;
+              SE.isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1262,7 +1290,6 @@
     }
   }
   ++C.NumRegs;
-
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
   if (!isa<SCEVUnknown>(Reg) &&
@@ -1354,8 +1381,9 @@
     // specifically not supported.
     if (LU.Kind == LSRUse::Address && Offset != 0 &&
         !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
-                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+                              Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) {
       C.NumBaseAdds++;
+    }
   }
 
   // If we don't count instruction cost exit here.
@@ -1394,6 +1422,7 @@
 
   // BaseAdds adds instructions for unfolded registers.
   if (LU.Kind != LSRUse::ICmpZero)
     C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+
   assert(isValid() && "invalid cost");
 }
@@ -3738,8 +3767,8 @@
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
@@ -3761,8 +3790,34 @@
 
       (void)InsertFormula(LU, LUIdx, F);
     }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate post index
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we could generate a G - 8 base which would require the first
+  // access to be ((G - 8) + 8),+,8. The post-indexed access would then update
+  // the pointer for itself in the next iteration.
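+  // For example (illustrative, using the unrolled-by-two i32 loops from the
+  // new test below): with Worklist offsets {0, 4} and a step of 8, the code
+  // below also tries offsets {-8, -4}. The -8 form rebases access #0 onto
+  // ((G - 8) + 8),+,8, which a post-indexed load/store can implement while
+  // also producing the pointer that the next iteration starts from, so the
+  // separate pointer add in the loop body disappears.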
+  if (TTI.shouldFavorCrossIterationPostInc() && LU.Kind == LSRUse::Address) {
+    if (auto *GAddRec = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAddRec->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
   }
 
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);
+
   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
     return;
Index: test/CodeGen/ARM/dsp-post-incs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/dsp-post-incs.ll
@@ -0,0 +1,531 @@
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m7 %s -o - | FileCheck %s --check-prefix=DISABLED
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=MINSIZE
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX
+
+; Tests to check that post increment addressing modes are used instead of
+; updating base pointers with add instructions.
+
+; DISABLED-LABEL: test_qadd_2
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_2
+; CHECK: sub{{.*}} [[A:r[0-9]+]], r0, #8
+; CHECK: subs [[B:r[0-9]+]], #8
+; CHECK: subs [[OUT:r[0-9]+]], #8
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #4]
+; CHECK: blo
+define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_2_backwards
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_2_backwards
+
+; CHECK-DEFAULT: [[shift:[rl0-9]+]], r3, #2
+; CHECK-DEFAULT: add{{.*}} [[A:r[0-9]+]], r0, [[shift]], lsl #2
+; CHECK-DEFAULT: add{{.*}} [[B:r[0-9]+]], r1, [[shift]], lsl #2
+; CHECK-DEFAULT: add{{.*}} [[OUT:r[0-9]+]], r2, [[shift]], lsl #2
+
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[B]], #-8]!
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[A]], #-8]!
+; CHECK-DEFAULT: str{{.*}}, {{\[}}[[OUT]], #-8]!
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[B]], #-4]
+; CHECK-DEFAULT: ldr{{.*}}, {{\[}}[[A]], #-4]
+; CHECK-DEFAULT: str{{.*}}, {{\[}}[[OUT]], #-4]
+
+; FIXME: The higher complexity produces more instructions in the preheader
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: str{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: ldr{{.*}} lsl #2]
+; CHECK-COMPLEX: str{{.*}} lsl #2]
+
+; CHECK: blo
+
+define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = sub nsw nuw i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = sub nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_3
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_3
+; CHECK: sub{{.*}}, #12
+; CHECK: sub{{.*}}, #12
+; CHECK: subs{{.*}}, #12
+
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #12]!
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: str{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK: blo
+define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nuw nsw i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %idx.3 = add nuw nsw i32 %idx.1, 2
+  %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
+  %a.3 = load i32, i32* %gep.a.3
+  %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
+  %b.3 = load i32, i32* %gep.b.3
+  %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
+  %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
+  store i32 %qadd.3, i32* %addr.3
+  %i.next = add nsw nuw i32 %i, -3
+  %idx.next = add nsw nuw i32 %idx.1, 3
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd_4
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd_4
+; CHECK-COMPLEX: sub{{.*}}, #16
+; CHECK-COMPLEX: sub{{.*}}, #16
+; CHECK-COMPLEX: sub{{.*}}, #16
+
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #16]!
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}} #4]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #4]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #8]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK-COMPLEX: ldr{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK-COMPLEX: str{{.*}}, {{\[}}{{.*}}, #12]
+; CHECK: blo
+define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1
+  %a.1 = load i32, i32* %gep.a.1
+  %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1
+  %b.1 = load i32, i32* %gep.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2
+  %a.2 = load i32, i32* %gep.a.2
+  %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2
+  %b.2 = load i32, i32* %gep.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %idx.3 = or i32 %idx.1, 2
+  %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3
+  %a.3 = load i32, i32* %gep.a.3
+  %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3
+  %b.3 = load i32, i32* %gep.b.3
+  %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3)
+  %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3
+  store i32 %qadd.3, i32* %addr.3
+  %idx.4 = or i32 %idx.1, 3
+  %gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4
+  %a.4 = load i32, i32* %gep.a.4
+  %gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4
+  %b.4 = load i32, i32* %gep.b.4
+  %qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4)
+  %addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4
+  store i32 %qadd.4, i32* %addr.4
+  %i.next = add nsw nuw i32 %i, -4
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_qadd16_2
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: test_qadd16_2
+; CHECK: sub.w [[A:r[0-9]+]], r0, #8
+; CHECK: subs [[B:r[0-9]+]], #8
+; CHECK: subs [[OUT:r[0-9]+]], #16
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #16]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+; CHECK: str{{.*}}, {{\[}}[[OUT]], #8]
+define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; MINSIZE-LABEL: test_qadd16_2_minsize
+; MINSIZE-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; MINSIZE-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+define void @test_qadd16_2_minsize(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) minsize {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; MINSIZE-LABEL: test_qadd16_2_size
+; MINSIZE: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; MINSIZE: str{{.*}}, [{{.*}}, {{.*}}]!
+define void @test_qadd16_2_size(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) optsize {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1
+  %cast.a.1 = bitcast i16* %gep.a.1 to i32*
+  %a.1 = load i32, i32* %cast.a.1
+  %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1
+  %cast.b.1 = bitcast i16* %gep.b.1 to i32*
+  %b.1 = load i32, i32* %cast.b.1
+  %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1)
+  %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1
+  store i32 %qadd.1, i32* %addr.1
+  %idx.2 = add nsw nuw i32 %idx.1, 2
+  %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2
+  %cast.a.2 = bitcast i16* %gep.a.2 to i32*
+  %a.2 = load i32, i32* %cast.a.2
+  %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2
+  %cast.b.2 = bitcast i16* %gep.b.2 to i32*
+  %b.2 = load i32, i32* %cast.b.2
+  %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2)
+  %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2
+  store i32 %qadd.2, i32* %addr.2
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 4
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; DISABLED-LABEL: test_fma
+; DISABLED-NOT: vldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: vstr{{.*}}, [{{.*}}, {{.*}}]!
+
+; TODO: I think we should be able to use post inc addressing with VLDM
+; instructions.
+; CHECK-LABEL: test_fma
+; CHECK: subs [[A:r[0-9]+]], #8
+; CHECK: subs [[B:r[0-9]+]], #8
+
+; CHECK: vldr s{{.*}}, {{\[}}[[B]], #8]
+; CHECK: vldr s{{.*}}, {{\[}}[[A]], #8]
+; CHECK: vldr s{{.*}}, {{\[}}[[B]], #12]
+; CHECK: vldr s{{.*}}, {{\[}}[[A]], #12]
+define float @test_fma(float* %a, float* %b, i32 %N) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
+  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
+  %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
+  %a.1 = load float, float* %gep.a.1
+  %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
+  %b.1 = load float, float* %gep.b.1
+  %fmul.1 = fmul float %a.1, %b.1
+  %fma.1 = fadd float %fmul.1, %res
+  %idx.2 = or i32 %idx.1, 1
+  %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
+  %a.2 = load float, float* %gep.a.2
+  %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
+  %b.2 = load float, float* %gep.b.2
+  %fmul.2 = fmul float %a.2, %b.2
+  %fma.2 = fadd float %fmul.2, %fma.1
+  %i.next = add nsw nuw i32 %i, -2
+  %idx.next = add nsw nuw i32 %idx.1, 2
+  %cmp = icmp ult i32 %i.next, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret float %fma.2
+}
+
+; DISABLED-LABEL: convolve_16bit
+; DISABLED-NOT: ldr{{.*}}, [{{.*}}, {{.*}}]!
+; DISABLED-NOT: str{{.*}}, [{{.*}}, {{.*}}]!
+
+; CHECK-LABEL: convolve_16bit
+
+; CHECK: ldr.w {{.*}}, [{{.*}}, lsl #2]
+; CHECK: ldr.w [[pA:r[rl0-9]+]], [{{.*}}, lsl #2]
+; CHECK: ldr.w [[pB:[rl0-9]+]], [{{.*}}, lsl #2]
+; CHECK: add{{.*}} [[A:[rl0-9]+]], [[pA]], {{.*}}, lsl #1
+; CHECK: sub{{.*}} [[B:[rl0-9]+]], [[pB]], #8
+
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #8]!
+; CHECK: ldr{{.*}}, {{\[}}[[B]], #4]
+; CHECK: ldr{{.*}}, {{\[}}[[A]], #4]
+define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
+                            i32 %filter_dim, i32 %out_width, i32 %out_height,
+                            i32** nocapture readonly %convolved) {
+entry:
+  %cmp92 = icmp eq i32 %out_height, 0
+  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:  ; preds = %entry
+  %xtraiter = and i32 %filter_dim, 3
+  %unroll_iter = sub i32 %filter_dim, %xtraiter
+  br label %for.cond1.preheader
+
+for.cond1.preheader:  ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
+  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
+  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
+  %tmp3 = load i32*, i32** %arrayidx22, align 4
+  br label %for.cond9.preheader.us.us.preheader
+
+for.cond9.preheader.us.us.preheader:  ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
+  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
+  br label %for.cond9.preheader.us.us
+
+for.cond9.preheader.us.us:  ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
+  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
+  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
+  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
+  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
+  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
+  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
+  br label %for.body12.us.us
+
+for.body12.us.us:  ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
+  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
+  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
+  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
+  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
+  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
+  %conv.us.us = sext i16 %tmp9 to i32
+  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
+  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
+  %conv17.us.us = sext i16 %tmp10 to i32
+  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
+  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
+  %inc.us.us = or i32 %filter_x.053.us.us, 1
+  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
+  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
+  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
+  %conv.us.us.1 = sext i16 %tmp11 to i32
+  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
+  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
+  %conv17.us.us.1 = sext i16 %tmp12 to i32
+  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
+  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
+  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
+  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
+  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
+  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
+  %conv.us.us.2 = sext i16 %tmp13 to i32
+  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
+  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
+  %conv17.us.us.2 = sext i16 %tmp14 to i32
+  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
+  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
+  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
+  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
+  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
+  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
+  %conv.us.us.3 = sext i16 %tmp15 to i32
+  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
+  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
+  %conv17.us.us.3 = sext i16 %tmp16 to i32
+  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
+  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
+  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
+  %niter.nsub.3 = add i32 %niter, -4
+  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
+  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
+
+for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa:  ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
+  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
+  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
+  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
+
+for.cond5.for.cond.cleanup7_crit_edge.us:  ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
+  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
+  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
+  %add25.us = add nuw i32 %res_x.060.us, 1
+  %exitcond99 = icmp eq i32 %add25.us, %out_width
+  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
+
+for.cond.cleanup3:  ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
+  %add28 = add nuw i32 %res_y.093, 1
+  %exitcond100 = icmp eq i32 %add28, %out_height
+  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:  ; preds = %for.cond.cleanup3, %entry
+  ret void
+}
+
+declare i32 @llvm.arm.qadd(i32, i32)
+declare i32 @llvm.arm.qadd16(i32, i32)
+
Index: test/CodeGen/ARM/loop-align-cortex-m.ll
===================================================================
--- test/CodeGen/ARM/loop-align-cortex-m.ll
+++ test/CodeGen/ARM/loop-align-cortex-m.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
 ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
 
 define void @test_loop_alignment(i32* %in, i32* %out) optsize {
 ; CHECK-LABEL: test_loop_alignment:
-; CHECK: movs {{r[0-9]+}}, #0
+; CHECK: mov{{.*}}, #4092
 ; CHECK: .p2align 2
 
 entry:
Index: test/Transforms/LoopStrengthReduce/ARM/complexity.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/ARM/complexity.ll
+++ test/Transforms/LoopStrengthReduce/ARM/complexity.ll
@@ -1,21 +1,25 @@
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX +; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s +; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m4 %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s -; CHECK-DEFAULT-LABEL: for.body12.us.us: -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8 +; CHECK-LABEL: for.cond9.preheader.us.us: +; CHECK: [[SCEVGEP:%[^ ]+]] = getelementptr i16, i16* %tmp5, i32 -4 +; CHECK: [[SCEVGEP9:%[^ ]+]] = getelementptr i16, i16* %tmp6, i32 %lsr.iv -; CHECK-COMPLEX-LABEL: for.body12.us.us: -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 -; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4 +; CHECK-LABEL: for.body12.us.us: +; CHECK: [[LSR_IV10:%[^ ]+]] = phi i16* [ [[SCEVGEP11:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP9]], %for.cond9.preheader.us.us ] +; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP]], %for.cond9.preheader.us.us ] +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 4 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 5 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 5 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 6 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 6 +; CHECK: getelementptr i16, i16* [[LSR_IV]], i32 7 +; CHECK: getelementptr i16, i16* [[LSR_IV10]], i32 7 +; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK: [[SCEVGEP11]] = getelementptr i16, i16* [[LSR_IV10]], i32 4 define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) { entry: