Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -887,6 +887,12 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);
 
+  // Return the vectorized type or a clamped type that may have been
+  // previously identified.
+  Type* getClampedVectorTy(Instruction *I, unsigned VF);
+
+  bool isFreeCast(Instruction *I, unsigned VF);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -921,6 +927,12 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  std::map<Instruction *, Type *> ClampedInstrTys;
+  // Sext and Zext instructions that will disappear due to vectorization.
+  std::set<Instruction *> FreeCasts;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -4045,6 +4057,7 @@
 }
 
 unsigned LoopVectorizationCostModel::getWidestType() {
+
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
 
@@ -4372,16 +4385,17 @@
     BasicBlock *BB = *bb;
 
     // For each instruction in the old loop.
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    for (BasicBlock::reverse_iterator it = BB->rbegin(), e = BB->rend();
+         it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
        continue;
 
       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
        continue;
 
-      unsigned C = getInstructionCost(it, VF);
+      unsigned C = getInstructionCost(&*it, VF);
 
       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4401,6 +4415,10 @@
     Cost += BlockCost;
   }
 
+  // Clear the structures for this VF.
+  ClampedInstrTys.clear();
+  FreeCasts.clear();
+
   return Cost;
 }
 
@@ -4463,6 +4481,85 @@
   return false;
 }
 
+bool
+LoopVectorizationCostModel::isFreeCast(Instruction *I, unsigned VF) {
+  CastInst *CI = cast<CastInst>(I);
+  Type *DstTy = CI->getDestTy();
+  Type *DstVecTy = ToVectorTy(DstTy, VF);
+
+  // Search happens from truncs towards the possible extend instructions,
+  // so these would have been found earlier.
+  if (I->getOpcode() != Instruction::Trunc) {
+    if (FreeCasts.find(I) != FreeCasts.end())
+      return true;
+    else
+      return false;
+  }
+
+  if (!DstTy->isIntegerTy())
+    return false;
+
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+  unsigned ScalarWidth = DL.getTypeSizeInBits(DstTy);
+
+  // First, check whether the truncation destination size would be a legal
+  // vector type.
+  if (TTI.isTypeLegal(DstVecTy)) {
+    auto *ChainInst = cast<Instruction>(I->getOperand(0));
+    SmallVector<Instruction *, 2> PossibleFreeCasts;
+    unsigned NarrowOperands = 0;
+
+    // Then search the operands of the trunc use and look for sext/zext or
+    // constants within the size limit.
+    for (unsigned i = 0; i < ChainInst->getNumOperands(); ++i) {
+
+      if (isa<ConstantInt>(ChainInst->getOperand(i))) {
+        auto *ConstInt = cast<ConstantInt>(ChainInst->getOperand(i));
+
+        if (ConstInt->uge((1 << ScalarWidth) - 1))
+          break;
+
+        ++NarrowOperands;
+
+      } else if (isa<CastInst>(ChainInst->getOperand(i))) {
+        auto *Cast = cast<CastInst>(ChainInst->getOperand(i));
+
+        if (!Cast->isIntegerCast())
+          break;
+
+        ++NarrowOperands;
+
+        // Check whether the original size of the sext/zext is the same
+        // as the trunc destination size.
+        if (Cast->getSrcTy() == DstTy)
+          PossibleFreeCasts.push_back(Cast);
+        else
+          // If not, we still save the clamped size so later the
+          // zext/sext instruction knows what size is necessary.
+          ClampedInstrTys.insert(std::make_pair(Cast, DstVecTy));
+      }
+    }
+    // If all the operands to this instruction are narrow, make the trunc
+    // and the casts free and assign a clamped VectorTy to the ChainInst.
+    if (NarrowOperands == ChainInst->getNumOperands()) {
+      for (auto *I : PossibleFreeCasts)
+        FreeCasts.insert(I);
+      ClampedInstrTys.insert(std::make_pair(ChainInst, DstVecTy));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Type*
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (ClampedInstrTys.find(I) != ClampedInstrTys.end())
+    return ClampedInstrTys[I];
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4471,7 +4568,7 @@
     VF = 1;
 
   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -4630,15 +4727,19 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
 
-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    if (isFreeCast(I, VF))
+      return 0;
+    else
+      return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,147 @@
+; RUN: opt -S < %s -loop-vectorize -simplifycfg -instsimplify -instcombine 2>&1 | FileCheck %s
+
+; ModuleID = 'vector-test.c'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1, !tbaa !1
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3, align 1, !tbaa !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 2, !tbaa !4
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3, align 2, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1, !tbaa !1
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3, align 2, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx, align 2, !tbaa !4
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.7.0 (http://llvm.org/git/clang.git 93281cbcbb9b1ea2b6788a629d6aef3284957d05) (http://llvm.org/git/llvm.git 40048e70d7a63ab64df3d0e52107f3c0e3472571)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"short", !2, i64 0}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"int", !2, i64 0}
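
For reference, a hypothetical C source for the @add_a test above (the function name and signature are illustrative, not taken from the original vector-test.c; the other tests follow the same pattern with i16/i32 element types):

/* The promotion of unsigned char to int gives the zext i8 -> i32,
   add nuw nsw, trunc i32 -> i8 chain that isFreeCast() recognises,
   so the vectorized loop can stay in <16 x i8> registers. */
void add_a(const unsigned char *p, unsigned char *q, int len) {
  for (int i = 0; i < len; i++)
    q[i] = p[i] + 2;
}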