Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -376,6 +376,9 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Return true if splatting a scalar across a vector is expensive.
+  bool isSplatExpensive() const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -668,6 +671,7 @@
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool isSplatExpensive() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -840,6 +844,8 @@
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool isSplatExpensive() override { return Impl.isSplatExpensive(); }
+
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -251,6 +251,8 @@
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool isSplatExpensive() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -182,6 +182,10 @@
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::isSplatExpensive() const {
+  return TTIImpl->isSplatExpensive();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,6 +54,8 @@
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool isSplatExpensive() { return true; }
+
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
   /// is IEEE-754 compliant, but it's not covered in this target.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2026,7 +2026,8 @@
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(EntryVal)) {
+      (!Legal->isScalarAfterVectorization(EntryVal) ||
+       TTI->isSplatExpensive())) {
     createVectorIntInductionPHI(ID, Entry, TruncType);
     VectorizedIV = true;
  }
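For context on the LoopVectorize.cpp hunk above: when the induction variable is
kept scalar, every vector user of it must first broadcast the scalar, which
costs an insertelement plus a shufflevector per iteration (on ARM this
typically lowers to a GPR-to-NEON transfer such as VDUP). With the new hook, a
target that reports splats as expensive gets the vector induction PHI even when
the IV would otherwise be scalar after vectorization. A minimal sketch of the
two shapes at VF = 4; the value names and the width are illustrative, not taken
from this patch:

  ; IV kept scalar: the splat is rematerialized on every iteration.
  %iv    = phi i32 [ 0, %ph ], [ %iv.next, %body ]
  %ins   = insertelement <4 x i32> undef, i32 %iv, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer

  ; Vector induction PHI from createVectorIntInductionPHI(): no splat, just a
  ; vector recurrence stepped by a vector add.
  %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %ph ], [ %vec.ind.next, %body ]
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>

The test below checks exactly this: a scalar phi i32 may remain for the loop
control, but the <16 x i32> induction value must come from a vector PHI, not
from an insertelement/shufflevector pair.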
Index: test/Transforms/LoopVectorize/ARM/dont-splat.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/ARM/dont-splat.ll
@@ -0,0 +1,104 @@
+; RUN: opt -loop-vectorize -dce -instcombine -simplifycfg -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-arm-none-eabi"
+
+;CHECK-LABEL: vector.body:
+;CHECK: phi i32
+;CHECK-NOT: insertelement <16 x i32>
+;CHECK-NOT: shufflevector <16 x i32>
+;CHECK: phi <16 x i32>
+;CHECK: add nuw nsw <16 x i32>
+
+define void @test(i16 zeroext %xMax, i16 zeroext %yMax, i8* noalias nocapture readonly %input, i8* noalias nocapture %output) #0 {
+entry:
+  %conv1 = zext i16 %yMax to i32
+  %sub = add nsw i32 %conv1, -1
+  %cmp99 = icmp sgt i32 %sub, 1
+  br i1 %cmp99, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond3.preheader.lr.ph:                        ; preds = %entry
+  %conv5 = zext i16 %xMax to i32
+  %sub6 = add nsw i32 %conv5, -1
+  %cmp796 = icmp sgt i32 %sub6, 1
+  br label %for.cond3.preheader
+
+for.cond3.preheader:                              ; preds = %for.cond3.preheader.lr.ph, %for.cond.cleanup9
+  %conv101 = phi i32 [ 1, %for.cond3.preheader.lr.ph ], [ %conv, %for.cond.cleanup9 ]
+  %y.0100 = phi i16 [ 1, %for.cond3.preheader.lr.ph ], [ %inc64, %for.cond.cleanup9 ]
+  br i1 %cmp796, label %for.body10.lr.ph, label %for.cond.cleanup9
+
+for.body10.lr.ph:                                 ; preds = %for.cond3.preheader
+  %mul = mul nuw nsw i32 %conv101, %conv5
+  br label %for.body10
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup9, %entry
+  ret void
+
+for.cond.cleanup9:                                ; preds = %for.body10, %for.cond3.preheader
+  %inc64 = add i16 %y.0100, 1
+  %conv = zext i16 %inc64 to i32
+  %cmp = icmp slt i32 %conv, %sub
+  br i1 %cmp, label %for.cond3.preheader, label %for.cond.cleanup
+
+for.body10:                                       ; preds = %for.body10.lr.ph, %for.body10
+  %conv498 = phi i32 [ 1, %for.body10.lr.ph ], [ %conv4, %for.body10 ]
+  %x.097 = phi i16 [ 1, %for.body10.lr.ph ], [ %inc, %for.body10 ]
+  %add = add nuw nsw i32 %conv498, %mul
+  %sub15 = sub nsw i32 %add, %conv5
+  %sub16 = add i32 %sub15, -1
+  %arrayidx = getelementptr inbounds i8, i8* %input, i32 %sub16
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv17 = zext i8 %0 to i32
+  %mul18 = mul nuw nsw i32 %conv17, 3
+  %arrayidx21 = getelementptr inbounds i8, i8* %input, i32 %sub15
+  %1 = load i8, i8* %arrayidx21, align 1
+  %conv22 = zext i8 %1 to i32
+  %mul23 = mul nuw nsw i32 %conv22, 5
+  %add24 = add nuw nsw i32 %mul23, %mul18
+  %add27 = add i32 %sub15, 1
+  %arrayidx28 = getelementptr inbounds i8, i8* %input, i32 %add27
+  %2 = load i8, i8* %arrayidx28, align 1
+  %conv29 = zext i8 %2 to i32
+  %mul30 = mul nuw nsw i32 %conv29, 7
+  %add31 = add nuw nsw i32 %add24, %mul30
+  %sub32 = add nsw i32 %add, -1
+  %arrayidx33 = getelementptr inbounds i8, i8* %input, i32 %sub32
+  %3 = load i8, i8* %arrayidx33, align 1
+  %conv34 = zext i8 %3 to i32
+  %mul35 = mul nuw nsw i32 %conv34, 9
+  %add36 = add nuw nsw i32 %add31, %mul35
+  %arrayidx37 = getelementptr inbounds i8, i8* %input, i32 %add
+  %4 = load i8, i8* %arrayidx37, align 1
+  %conv38 = zext i8 %4 to i32
+  %mul39 = mul nuw nsw i32 %conv38, 11
+  %add40 = add nuw nsw i32 %add36, %mul39
+  %add41 = add nuw i32 %add, 1
+  %arrayidx42 = getelementptr inbounds i8, i8* %input, i32 %add41
+  %5 = load i8, i8* %arrayidx42, align 1
+  %conv43 = zext i8 %5 to i32
+  %mul44 = mul nuw nsw i32 %conv43, 13
+  %add45 = add nuw nsw i32 %add40, %mul44
+  %add47 = add nuw i32 %add, %conv5
+  %sub48 = add i32 %add47, -1
+  %arrayidx49 = getelementptr inbounds i8, i8* %input, i32 %sub48
+  %6 = load i8, i8* %arrayidx49, align 1
+  %conv50 = zext i8 %6 to i32
+  %mul51 = mul nuw nsw i32 %conv50, 15
+  %add52 = add nsw i32 %add45, %mul51
+  %arrayidx55 = getelementptr inbounds i8, i8* %input, i32 %add47
+  %7 = load i8, i8* %arrayidx55, align 1
+  %conv56 = zext i8 %7 to i32
+  %mul57 = mul nuw nsw i32 %conv56, 17
+  %add58 = add nsw i32 %add52, %mul57
+  %8 = lshr i32 %add58, 8
+  %conv61 = trunc i32 %8 to i8
+  %arrayidx62 = getelementptr inbounds i8, i8* %output, i32 %add
+  store i8 %conv61, i8* %arrayidx62, align 1
+  %inc = add i16 %x.097, 1
+  %conv4 = zext i16 %inc to i32
+  %cmp7 = icmp slt i32 %conv4, %sub6
+  br i1 %cmp7, label %for.body10, label %for.cond.cleanup9
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
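For reference, the RUN line can be expanded by hand to reproduce the check
outside of lit, substituting the path of the new file for %s:

  opt -loop-vectorize -dce -instcombine -simplifycfg -S \
      < test/Transforms/LoopVectorize/ARM/dont-splat.ll \
      | FileCheck test/Transforms/LoopVectorize/ARM/dont-splat.ll

The two CHECK-NOT lines carry the weight of the test: with target-cpu
cortex-a9, where the ARM override of isSplatExpensive() now returns true, the
vectorized body must keep its <16 x i32> induction value in a PHI rather than
rebuilding it each iteration from the scalar IV.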