Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -376,6 +376,9 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Return true if splatting a scalar across a vector is expensive.
+  bool isSplatExpensive() const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -668,6 +671,7 @@
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool isSplatExpensive() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -840,6 +844,8 @@
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool isSplatExpensive() override { return Impl.isSplatExpensive(); }
+
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -251,6 +251,8 @@
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool isSplatExpensive() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -182,6 +182,10 @@
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::isSplatExpensive() const {
+  return TTIImpl->isSplatExpensive();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,6 +54,8 @@
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool isSplatExpensive() { return true; }
+
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
   /// is IEEE-754 compliant, but it's not covered in this target.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2026,7 +2026,8 @@
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(EntryVal)) {
+      (!Legal->isScalarAfterVectorization(EntryVal) ||
+       TTI->isSplatExpensive())) {
     createVectorIntInductionPHI(ID, Entry, TruncType);
     VectorizedIV = true;
  }
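For context on the LoopVectorize.cpp hunk above: when the induction variable is
kept scalar, every vector user of it must first broadcast the scalar, which
costs an insertelement plus a shufflevector per iteration (on ARM this
typically lowers to a GPR-to-NEON transfer such as VDUP). With the new hook, a
target that reports splats as expensive gets the vector induction PHI even when
the IV would otherwise be scalar after vectorization. A minimal sketch of the
two shapes at VF = 4; the value names and the width are illustrative, not taken
from this patch:

  ; IV kept scalar: the splat is rematerialized on every iteration.
  %iv    = phi i32 [ 0, %ph ], [ %iv.next, %body ]
  %ins   = insertelement <4 x i32> undef, i32 %iv, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer

  ; Vector induction PHI from createVectorIntInductionPHI(): no splat, just a
  ; vector recurrence stepped by a vector add.
  %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %ph ], [ %vec.ind.next, %body ]
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>

The test below checks exactly this: a scalar phi i32 may remain for the loop
control, but the <16 x i32> induction value must come from a vector PHI, not
from an insertelement/shufflevector pair.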
Index: test/Transforms/LoopVectorize/ARM/dont-splat.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/ARM/dont-splat.ll
@@ -0,0 +1,104 @@
+; RUN: opt -loop-vectorize -dce -instcombine -simplifycfg -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-arm-none-eabi"
+
+;CHECK-LABEL: vector.body:
+;CHECK: phi i32
+;CHECK-NOT: insertelement <16 x i32>
+;CHECK-NOT: shufflevector <16 x i32>
+;CHECK: phi <16 x i32>
+;CHECK: add nuw nsw <16 x i32>
+
+define void @test(i16 zeroext %xMax, i16 zeroext %yMax, i8* noalias nocapture readonly %input, i8* noalias nocapture %output) #0 {
+entry:
+  %conv1 = zext i16 %yMax to i32
+  %sub = add nsw i32 %conv1, -1
+  %cmp99 = icmp sgt i32 %sub, 1
+  br i1 %cmp99, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond3.preheader.lr.ph:                        ; preds = %entry
+  %conv5 = zext i16 %xMax to i32
+  %sub6 = add nsw i32 %conv5, -1
+  %cmp796 = icmp sgt i32 %sub6, 1
+  br label %for.cond3.preheader
+
+for.cond3.preheader:                              ; preds = %for.cond3.preheader.lr.ph, %for.cond.cleanup9
+  %conv101 = phi i32 [ 1, %for.cond3.preheader.lr.ph ], [ %conv, %for.cond.cleanup9 ]
+  %y.0100 = phi i16 [ 1, %for.cond3.preheader.lr.ph ], [ %inc64, %for.cond.cleanup9 ]
+  br i1 %cmp796, label %for.body10.lr.ph, label %for.cond.cleanup9
+
+for.body10.lr.ph:                                 ; preds = %for.cond3.preheader
+  %mul = mul nuw nsw i32 %conv101, %conv5
+  br label %for.body10
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup9, %entry
+  ret void
+
+for.cond.cleanup9:                                ; preds = %for.body10, %for.cond3.preheader
+  %inc64 = add i16 %y.0100, 1
+  %conv = zext i16 %inc64 to i32
+  %cmp = icmp slt i32 %conv, %sub
+  br i1 %cmp, label %for.cond3.preheader, label %for.cond.cleanup
+
+for.body10:                                       ; preds = %for.body10.lr.ph, %for.body10
+  %conv498 = phi i32 [ 1, %for.body10.lr.ph ], [ %conv4, %for.body10 ]
+  %x.097 = phi i16 [ 1, %for.body10.lr.ph ], [ %inc, %for.body10 ]
+  %add = add nuw nsw i32 %conv498, %mul
+  %sub15 = sub nsw i32 %add, %conv5
+  %sub16 = add i32 %sub15, -1
+  %arrayidx = getelementptr inbounds i8, i8* %input, i32 %sub16
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv17 = zext i8 %0 to i32
+  %mul18 = mul nuw nsw i32 %conv17, 3
+  %arrayidx21 = getelementptr inbounds i8, i8* %input, i32 %sub15
+  %1 = load i8, i8* %arrayidx21, align 1
+  %conv22 = zext i8 %1 to i32
+  %mul23 = mul nuw nsw i32 %conv22, 5
+  %add24 = add nuw nsw i32 %mul23, %mul18
+  %add27 = add i32 %sub15, 1
+  %arrayidx28 = getelementptr inbounds i8, i8* %input, i32 %add27
+  %2 = load i8, i8* %arrayidx28, align 1
+  %conv29 = zext i8 %2 to i32
+  %mul30 = mul nuw nsw i32 %conv29, 7
+  %add31 = add nuw nsw i32 %add24, %mul30
+  %sub32 = add nsw i32 %add, -1
+  %arrayidx33 = getelementptr inbounds i8, i8* %input, i32 %sub32
+  %3 = load i8, i8* %arrayidx33, align 1
+  %conv34 = zext i8 %3 to i32
+  %mul35 = mul nuw nsw i32 %conv34, 9
+  %add36 = add nuw nsw i32 %add31, %mul35
+  %arrayidx37 = getelementptr inbounds i8, i8* %input, i32 %add
+  %4 = load i8, i8* %arrayidx37, align 1
+  %conv38 = zext i8 %4 to i32
+  %mul39 = mul nuw nsw i32 %conv38, 11
+  %add40 = add nuw nsw i32 %add36, %mul39
+  %add41 = add nuw i32 %add, 1
+  %arrayidx42 = getelementptr inbounds i8, i8* %input, i32 %add41
+  %5 = load i8, i8* %arrayidx42, align 1
+  %conv43 = zext i8 %5 to i32
+  %mul44 = mul nuw nsw i32 %conv43, 13
+  %add45 = add nuw nsw i32 %add40, %mul44
+  %add47 = add nuw i32 %add, %conv5
+  %sub48 = add i32 %add47, -1
+  %arrayidx49 = getelementptr inbounds i8, i8* %input, i32 %sub48
+  %6 = load i8, i8* %arrayidx49, align 1
+  %conv50 = zext i8 %6 to i32
+  %mul51 = mul nuw nsw i32 %conv50, 15
+  %add52 = add nsw i32 %add45, %mul51
+  %arrayidx55 = getelementptr inbounds i8, i8* %input, i32 %add47
+  %7 = load i8, i8* %arrayidx55, align 1
+  %conv56 = zext i8 %7 to i32
+  %mul57 = mul nuw nsw i32 %conv56, 17
+  %add58 = add nsw i32 %add52, %mul57
+  %8 = lshr i32 %add58, 8
+  %conv61 = trunc i32 %8 to i8
+  %arrayidx62 = getelementptr inbounds i8, i8* %output, i32 %add
+  store i8 %conv61, i8* %arrayidx62, align 1
+  %inc = add i16 %x.097, 1
+  %conv4 = zext i16 %inc to i32
+  %cmp7 = icmp slt i32 %conv4, %sub6
+  br i1 %cmp7, label %for.body10, label %for.cond.cleanup9
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a9" "target-features"="+dsp,+fp16,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
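For reference, the RUN line can be expanded by hand to reproduce the check
outside of lit, substituting the path of the new file for %s:

  opt -loop-vectorize -dce -instcombine -simplifycfg -S \
      < test/Transforms/LoopVectorize/ARM/dont-splat.ll \
      | FileCheck test/Transforms/LoopVectorize/ARM/dont-splat.ll

The two CHECK-NOT lines carry the weight of the test: with target-cpu
cortex-a9, where the ARM override of isSplatExpensive() now returns true, the
vectorized body must keep its <16 x i32> induction value in a PHI rather than
rebuilding it each iteration from the scalar IV.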