Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -263,12 +263,13 @@
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                       DominatorTree *DT, const TargetLibraryInfo *TLI,
-                      const TargetTransformInfo *TTI, unsigned VecWidth,
+                      const TargetTransformInfo *TTI,
+                      LoopVectorizationCostModel *CostModel, unsigned VecWidth,
                       unsigned UnrollFactor)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr), AddedSafetyChecks(false) {}
+        Legal(nullptr), AddedSafetyChecks(false), CM(CostModel) {}

   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *L) {
@@ -328,7 +329,8 @@
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

   /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV,
+                            LoopVectorizationCostModel *CM, unsigned VF);

   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -472,14 +474,19 @@
   // Record whether runtime check is added.
   bool AddedSafetyChecks;
+
+  LoopVectorizationCostModel *CM;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                     DominatorTree *DT, const TargetLibraryInfo *TLI,
-                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+                    const TargetTransformInfo *TTI,
+                    LoopVectorizationCostModel *CostModel,
+                    unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, CostModel, 1,
+                            UnrollFactor) {}

 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -1136,6 +1143,11 @@
   /// \return information about the register usage of the loop.
   RegisterUsage calculateRegisterUsage();

+  /// \return A map of instructions that can use smaller types.
+  std::map<Instruction *, Type *> &getNarrowInstrs(unsigned VF) {
+    return NarrowInstrs[VF];
+  }
+
 private:
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
@@ -1143,6 +1155,34 @@
   /// the factor width.
   unsigned expectedCost(unsigned VF);

+  /// Return the vectorized type, or a clamped type that may have been
+  /// identified previously.
+  Type *getClampedVectorTy(Instruction *I, unsigned VF);
+
+  /// Expects a CastInst and returns the destination type, which may have been
+  /// recalculated to be smaller than before. The cast is free if the function
+  /// returns nullptr.
+  Type *getAdjustedCastType(Instruction *I, unsigned VF);
+
+  /// Create a pair from I and NarrowTy and insert it into the NarrowInstrs
+  /// map for VF. If an entry already exists for I, the larger of the two
+  /// types is kept.
+  void InsertConfirmedNarrow(Instruction *I, Type *NarrowTy, unsigned VF);
+
+  /// Create a pair from I and NarrowTy and insert it into the
+  /// CandidateNarrowInstrs map for VF. If an entry already exists for I, the
+  /// larger of the two types is kept. Returns false if
+  /// VectorType::get(NarrowTy, VF) is not a legal type.
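+  /// Candidates recorded in CandidateNarrowInstrs are only promoted into
+  /// NarrowInstrs once ConfirmNarrowChain has validated the whole chain.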
+  bool TryInsertCandidateNarrow(Instruction *I, Type *NarrowTy, unsigned VF);
+
+  /// Adds I to the candidate map with a smaller type if that is all it needs.
+  /// The candidate is confirmed later.
+  void MapNarrowInstruction(Instruction *I, unsigned VF);
+
+  /// Use the CandidateNarrowInstrs map to confirm that chains of instructions
+  /// can use smaller types.
+  bool ConfirmNarrowChain(Instruction *I, unsigned VF, Type *NarrowTy);
+
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
@@ -1177,6 +1217,11 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
+  // While searching from truncs, we store instructions which can use
+  // smaller types when vectorized.
+  std::map<unsigned, std::map<Instruction *, Type *>> NarrowInstrs;
+  std::map<unsigned, std::map<Instruction *, Type *>> CandidateNarrowInstrs;
 };

 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1671,11 +1716,11 @@
                      Twine("interleaved by " + Twine(IC) +
                            " (vectorization not beneficial)"));

-      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
+      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, &CM, IC);
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, &CM, VF.Width, IC);
       LB.vectorize(&LVL);
       ++LoopsVectorized;

@@ -2339,7 +2384,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-    UndefValue::get(VectorType::get(Instr->getType(), VF));
+    UndefValue::get(VectorType::get(Instr->getType(), VF));

   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
@@ -3048,8 +3093,9 @@
   // Vectorize all of the blocks in the original loop.
   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
-       be = DFS.endRPO(); bb != be; ++bb)
-    vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
+       be = DFS.endRPO(); bb != be; ++bb) {
+    vectorizeBlockInLoop(*bb, &RdxPHIsToFix, CM, VF);
+  }

   // At this point every instruction in the original loop is widened to
   // a vector form. We are almost done. Now, we need to fix the PHI nodes
@@ -3455,7 +3501,10 @@
   }
 }

-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
+void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV,
+                                               LoopVectorizationCostModel *CM,
+                                               unsigned VF) {
+  auto &ClampedVecTys = CM->getNarrowInstrs(VF);
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     VectorParts &Entry = WidenMap.get(it);
@@ -3496,11 +3545,32 @@
       // Use this vector value for all users of the original instruction.
       for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+        Value *APart = A[Part];
+        Value *BPart = B[Part];
+
+        // The CostModel may have identified operations that could be executed
+        // using smaller types, so convert those operations to use them. We do
+        // this by truncating the operands of binary operations that are part
+        // of a chain of operations that can use smaller types.
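+        // For example, with VF = 16 a scalar chain of the form
+        // 'zext i8 -> i32; add i32; trunc i32 -> i8' (as in the add_a test
+        // below) can perform the add directly on <16 x i8>, with both casts
+        // becoming free.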
+        if (ClampedVecTys.count(it)) {
+          assert(it->getType()->isIntegerTy() &&
+                 "Should not be clamping floats!");
+
+          Type *Ty = ClampedVecTys[it]->getScalarType();
+          VectorType *VecTy = VectorType::get(Ty, VF);
+          APart = Builder.CreateTrunc(APart, VecTy);
+          BPart = Builder.CreateTrunc(BPart, VecTy);
+        }
+
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), APart, BPart);
         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
           VecOp->copyIRFlags(BinOp);
+
+        // If the instruction has had its inputs clamped, we need to then
+        // extend the value back to its original size.
+        if (ClampedVecTys.count(it))
+          V = Builder.CreateZExt(V, VectorType::get(it->getType(), VF));
+
         Entry[Part] = V;
       }

@@ -4888,23 +4958,41 @@
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;

+  NarrowInstrs.insert(std::make_pair(VF, std::map<Instruction *, Type *>()));
+
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
     unsigned BlockCost = 0;
     BasicBlock *BB = *bb;

-    // For each instruction in the old loop.
+    // For each instruction in the old loop, scan across them to identify
+    // instructions that could use narrower types.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       // Skip dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(it))
+      if (isa<DbgInfoIntrinsic>(*it))
         continue;

       // Ignore ephemeral values.
-      if (EphValues.count(it))
+      if (EphValues.count(&*it))
         continue;

-      unsigned C = getInstructionCost(it, VF);
+      MapNarrowInstruction(it, VF);
+    }
+
+    // For each instruction in the old loop. Iterate in reverse order so that
+    // narrow-typed instruction chains can be confirmed by searching bottom-up
+    // from trunc instructions.
+    for (auto it = BB->rbegin(), e = BB->rend(); it != e; ++it) {
+      // Skip dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(*it))
+        continue;
+
+      // Ignore ephemeral values.
+      if (EphValues.count(&*it))
+        continue;
+
+      unsigned C = getInstructionCost(&*it, VF);

       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -4986,6 +5074,246 @@
   return false;
 }

+// The search for free casts and smaller types begins at truncs and moves
+// towards the possible extend instructions; the extend instructions found
+// during the search are saved.
+Type *LoopVectorizationCostModel::getAdjustedCastType(Instruction *I,
+                                                      unsigned VF) {
+  if (NarrowInstrs[VF].count(I)) {
+    Type *NarrowTy = nullptr;
+    if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      if (TruncInst *TI = dyn_cast<TruncInst>(I))
+        NarrowTy = TI->getDestTy();
+      else
+        NarrowTy = CI->getSrcTy();
+
+      if (NarrowTy == NarrowInstrs[VF][I]) {
+        DEBUG(dbgs() << "LV: This cast is free: " << CI->getName() << "\n");
+        return nullptr;
+      } else
+        return NarrowInstrs[VF][I];
+    }
+  }
+  return I->getType();
+}
+
+Type *getLargestType(Type *T0, Type *T1) {
+  if (T0->isIntegerTy()) {
+    if (T1->isIntegerTy())
+      return T0->getIntegerBitWidth() > T1->getIntegerBitWidth() ? T0 : T1;
+    else
+      return T0;
+  } else if (T1->isIntegerTy())
+    return T1;
+
+  llvm_unreachable("one type should be integer!");
+  return T0;
+}
+
+bool LoopVectorizationCostModel::TryInsertCandidateNarrow(Instruction *I,
+                                                          Type *NarrowTy,
+                                                          unsigned VF) {
+  // We do not allow illegal types to be inserted.
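+  // For example, with VF = 16 an i8 NarrowTy gives <16 x i8>, which is legal
+  // on AArch64 NEON, whereas an i16 would give <16 x i16>, which is wider
+  // than a 128-bit vector register and so is rejected here.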
+  if (TTI.isTypeLegal(VectorType::get(NarrowTy, VF))) {
+    if (CandidateNarrowInstrs[VF].count(I)) {
+      Type *Largest = getLargestType(CandidateNarrowInstrs[VF][I], NarrowTy);
+      if (Largest == NarrowTy) {
+        CandidateNarrowInstrs[VF].erase(I);
+        CandidateNarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+      }
+    } else
+      CandidateNarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+    return true;
+  }
+  return false;
+}
+
+void LoopVectorizationCostModel::InsertConfirmedNarrow(Instruction *I,
+                                                       Type *NarrowTy,
+                                                       unsigned VF) {
+  if (NarrowInstrs[VF].count(I)) {
+    Type *Largest = getLargestType(NarrowInstrs[VF][I], NarrowTy);
+    if (Largest == NarrowTy) {
+      NarrowInstrs[VF].erase(I);
+      NarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+    }
+  } else
+    NarrowInstrs[VF].insert(std::make_pair(I, NarrowTy));
+}
+
+Type *
+LoopVectorizationCostModel::getClampedVectorTy(Instruction *I, unsigned VF) {
+  if (NarrowInstrs[VF].count(I))
+    return ToVectorTy(NarrowInstrs[VF][I], VF);
+  else
+    return ToVectorTy(I->getType(), VF);
+}
+
+bool LoopVectorizationCostModel::ConfirmNarrowChain(Instruction *I,
+                                                    unsigned VF,
+                                                    Type *NarrowTy) {
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // If a narrow type has already been suggested for this instruction, update
+  // NarrowTy to be the larger of these types.
+  if (CandidateNarrowInstrs[VF].count(I))
+    NarrowTy = getLargestType(NarrowTy, CandidateNarrowInstrs[VF][I]);
+
+  // Extend instructions now use NarrowTy; this was either deduced in the
+  // first top-down phase, or it has now been recalculated from the value
+  // required by the extend's single user.
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    if (!isa<TruncInst>(CI)) {
+      InsertConfirmedNarrow(CI, NarrowTy, VF);
+      return true;
+    }
+    return false;
+  }
+
+  // Right shifts can be allowed if the size reduction does not prevent the
+  // necessary high bits from being calculated. This is true when the shift
+  // amount is less than or equal to the width of the suggested narrow type;
+  // that narrow type is then doubled in size so as not to lose the calculated
+  // high bits. For example, a right shift by 8 with a suggested type of i8 is
+  // instead evaluated as i16.
+  if (I->getOpcode() == Instruction::LShr ||
+      I->getOpcode() == Instruction::AShr) {
+    unsigned ShiftVal = cast<ConstantInt>(I->getOperand(1))->getZExtValue();
+    if (ShiftVal <= 8 && NarrowTy == Type::getInt8Ty(Context))
+      NarrowTy = Type::getInt16Ty(Context);
+    else if (ShiftVal <= 16 && I->getType() == Type::getInt64Ty(Context) &&
+             NarrowTy == Type::getInt16Ty(Context))
+      NarrowTy = Type::getInt32Ty(Context);
+    else
+      return false;
+  }
+
+  // To confirm instruction I, for each operand Opr:
+  //  - if it is already confirmed, continue.
+  //  - if it is a candidate, perform confirmation of Opr.
+  //  - if it is a cast, the type may be adjusted and needs to be saved.
+  //  - if it is a constant, continue.
+  //  - if it is none of the above, this isn't a narrow instruction.
+  unsigned NumConfirmed = 0;
+  for (Value *Opr : I->operands()) {
+    if (Instruction *NextOp = dyn_cast<Instruction>(Opr)) {
+      // If we find an already confirmed operand, grab its value.
+      if (NarrowInstrs[VF].count(NextOp))
+        ++NumConfirmed;
+      else if (CandidateNarrowInstrs[VF].count(NextOp)) {
+        // Need to confirm the type of this operand.
+        if (ConfirmNarrowChain(NextOp, VF, NarrowTy))
+          ++NumConfirmed;
+        else
+          break;
+      } else if (isa<CastInst>(NextOp)) {
+        // Visit any new CastInsts using this narrow type.
+        if (ConfirmNarrowChain(NextOp, VF, NarrowTy))
+          ++NumConfirmed;
+      }
+    } else if (isa<ConstantInt>(Opr))
+      // Constants do not have to be re-checked; this was done when the
+      // candidate was first mapped. NarrowTy may have changed since, but it
+      // only grows.
+      ++NumConfirmed;
+  }
+
+  if (I->getNumOperands() != NumConfirmed) {
+    CandidateNarrowInstrs[VF].erase(I);
+    return false;
+  } else {
+    // If a value already exists for I, the larger type will be kept.
+    InsertConfirmedNarrow(I, NarrowTy, VF);
+    return true;
+  }
+}
+
+void LoopVectorizationCostModel::MapNarrowInstruction(Instruction *I,
+                                                      unsigned VF) {
+  if (isa<PHINode>(I))
+    return;
+
+  // We only care about integer operations.
+  Type *DstTy = I->getType();
+  if (!DstTy->isIntegerTy())
+    return;
+
+  // The following opcodes have been selected because they calculate the same
+  // values even with truncated types: they do not require the high bits that
+  // will ultimately be removed by the final truncation. This is not true for
+  // the right shift operations, but these can be included if the shift value
+  // is an immediate equal to or less than half of the bit width of the
+  // resulting type. The extend and trunc ops are also included as they bound
+  // the chains of operations.
+  unsigned Opc = I->getOpcode();
+  if (Opc != Instruction::Mul &&
+      Opc != Instruction::Add &&
+      Opc != Instruction::Sub &&
+      Opc != Instruction::And &&
+      Opc != Instruction::Or &&
+      Opc != Instruction::Xor &&
+      Opc != Instruction::Shl &&
+      Opc != Instruction::LShr &&
+      Opc != Instruction::AShr)
+    return;
+
+  // We can only analyse and validate right shift operations later if the
+  // shift value is an immediate.
+  if (Opc == Instruction::LShr ||
+      Opc == Instruction::AShr)
+    if (!isa<ConstantInt>(I->getOperand(1)))
+      return;
+
+  const APInt i8MaxValue = APInt::getMaxValue(8);
+  const APInt i16MaxValue = APInt::getMaxValue(16);
+  const APInt i32MaxValue = APInt::getMaxValue(32);
+  SmallVector<Type *, 2> NarrowOpTys;
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+  // Search the operands of the instruction and look for operands that have
+  // already been added, constants within the size limit, or sext/zext.
+  for (Value *Opr : I->operands()) {
+    if (Instruction *NextOp = dyn_cast<Instruction>(Opr)) {
+      // If the operand is an instruction, it needs to have already been added
+      // to CandidateNarrowInstrs, or to be a CastInst that may not be in the
+      // loop body.
+      if (CandidateNarrowInstrs[VF].count(NextOp))
+        NarrowOpTys.push_back(CandidateNarrowInstrs[VF][NextOp]);
+      else if (auto *CI = dyn_cast<CastInst>(NextOp)) {
+        if (auto *TI = dyn_cast<TruncInst>(I)) {
+          if (TryInsertCandidateNarrow(TI, TI->getDestTy(), VF))
+            NarrowOpTys.push_back(TI->getDestTy());
+        } else if (TryInsertCandidateNarrow(CI, CI->getSrcTy(), VF))
+          NarrowOpTys.push_back(CI->getSrcTy());
+      } else
+        return;
+    } else if (auto *ConstInt = dyn_cast<ConstantInt>(Opr)) {
+      // If the operand is a constant, calculate the smallest type that it can
+      // be. It isn't counted as a narrow operand if it is larger than 16 bits,
+      // unless the original value is an i64.
+      unsigned BitWidth = ConstInt->getValue().getBitWidth();
+      const APInt ConstVal = ConstInt->getValue();
+      APInt MaskValue = ConstVal & (i8MaxValue.zextOrSelf(BitWidth));
+      if (ConstVal.eq(MaskValue))
+        NarrowOpTys.push_back(Type::getInt8Ty(Context));
+      else if (ConstVal.eq(ConstVal & (i16MaxValue.zextOrSelf(BitWidth))))
+        NarrowOpTys.push_back(Type::getInt16Ty(Context));
+      else if (I->getType() == Type::getInt64Ty(Context) &&
+               ConstVal.eq(ConstVal & (i32MaxValue.zextOrSelf(BitWidth))))
+        NarrowOpTys.push_back(Type::getInt32Ty(Context));
+      else
+        return;
+    }
+  }
+
+  if (NarrowOpTys.size() == I->getNumOperands()) {
+    // If a narrow type was found for every operand, map the new narrow type
+    // for I: the shared type if both operands agree, or the larger of the two
+    // if they differ in size.
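+    // For example, an add whose operands were mapped as i8 and i16 is
+    // recorded as an i16 candidate.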
+    if (NarrowOpTys[0] == NarrowOpTys[1]) {
+      TryInsertCandidateNarrow(I, NarrowOpTys[0], VF);
+    } else {
+      Type *LargerType = getLargestType(NarrowOpTys[0], NarrowOpTys[1]);
+      TryInsertCandidateNarrow(I, LargerType, VF);
+    }
+  }
+}
+
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4994,7 +5322,7 @@
     VF = 1;

   Type *RetTy = I->getType();
-  Type *VectorTy = ToVectorTy(RetTy, VF);
+  Type *VectorTy = getClampedVectorTy(I, VF);

   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -5193,15 +5521,38 @@
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcTy, VF);
+    unsigned Opcode = I->getOpcode();
     // We optimize the truncation of induction variable.
     // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
+    if (Opcode == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(Opcode, I->getType(), SrcTy);
+    else if (Opcode == Instruction::Trunc) {
+      // First, check whether the truncation destination size would be a legal
+      // vector type.
+      if (TTI.isTypeLegal(VectorTy)) {
+        Instruction *ChainOp = cast<Instruction>(I->getOperand(0));
+        if (ConfirmNarrowChain(ChainOp, VF, RetTy)) {
+          DEBUG(dbgs() << "LV: Found a chain of narrow instructions\n");
+          Type *NarrowTy = NarrowInstrs[VF][ChainOp];
+          InsertConfirmedNarrow(I, NarrowTy, VF);
+        }
+      }
+    }

+    // If AdjustedType is nullptr, it is a free cast; otherwise use the
+    // adjusted type for either the source or the destination.
+    Type *AdjustedType = getAdjustedCastType(I, VF);
+    if (AdjustedType == nullptr)
+      return 0;
+    else if (Instruction::Trunc == Opcode)
+      SrcVecTy = ToVectorTy(AdjustedType, VF);
+    else if (Instruction::ZExt == Opcode ||
+             Instruction::SExt == Opcode)
+      VectorTy = ToVectorTy(AdjustedType, VF);

-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -5312,7 +5663,7 @@
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   Value *UndefVec = IsVoidRetTy ? nullptr :
-  UndefValue::get(Instr->getType());
+  UndefValue::get(VectorType::get(Instr->getType(), VF));

   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,298 @@
+; RUN: opt -S < %s -basicaa -loop-vectorize -simplifycfg -instsimplify -instcombine -licm 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; COST: cost of 2 {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: load <16 x i8>, <16 x i8>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i8
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv1, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: load <8 x i16>, <8 x i16>*
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: add nuw nsw <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %len, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv8 = zext i16 %0 to i32
+  %add = add nuw nsw i32 %conv8, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv, 2
+  %conv1 = trunc i32 %add to i16
+  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+  store i16 %conv1, i16* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_d(
+; COST: cost of 2 for instruction: {{.*}} load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; COST: cost of 2 for instruction: {{.*}} store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_e(
+; COST: cost of 2 for instruction: {{.*}} load <16 x i8>
+; CHECK: load <16 x i8>
+; CHECK: load <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} shl <16 x i8>
+; CHECK: shl nuw nsw <16 x i8>
+; CHECK: shl nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} add <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} or <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: or <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} and <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: and <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} xor <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: xor <16 x i8>
+; COST: cost of 1 for instruction: {{.*}} mul <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; COST: cost of 2 for instruction: {{.*}} store <16 x i8>
+; CHECK: store <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+  %cmp.32 = icmp sgt i32 %len, 0
+  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv11 = zext i8 %arg2 to i32
+  %conv13 = zext i8 %arg1 to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %conv = zext i8 %0 to i32
+  %add = shl nuw nsw i32 %conv, 4
+  %conv2 = add nuw nsw i32 %add, 32
+  %or = or i32 %conv, 51
+  %mul = mul nuw nsw i32 %or, 60
+  %and = and i32 %conv2, %conv13
+  %mul.masked = and i32 %mul, 252
+  %conv17 = xor i32 %mul.masked, %conv11
+  %mul18 = mul nuw nsw i32 %conv17, %and
+  %conv19 = trunc i32 %mul18 to i8
+  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv19, i8* %arrayidx21
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_f(
+; COST: cost of 2 for instruction: {{.*}} load <8 x i16>
+; CHECK: load <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} shl <8 x i16>
+; CHECK: shl nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} add <8 x i16>
+; CHECK: add nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} and <8 x i16>
+; CHECK: and <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} or <8 x i16>
+; CHECK: or <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} mul <8 x i16>
+; CHECK: mul nuw nsw <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} and <8 x i16>
+; CHECK: and <8 x i16>
+; CHECK: and <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} xor <8 x i16>
+; CHECK: xor <8 x i16>
+; COST: cost of 1 for instruction: {{.*}} mul <8 x i16>
+; CHECK: mul nuw nsw <8 x i16>
+; COST: cost of 28 for instruction: {{.*}} trunc <8 x i16>
+; CHECK: trunc <8 x i16>
+; COST: cost of 2 for instruction: {{.*}} store <8 x i8>
+; CHECK: store <8 x i8>
+define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+  %cmp.32 = icmp sgt i32 %len, 0
+  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv11 = zext i8 %arg2 to i32
+  %conv13 = zext i8 %arg1 to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+  %0 = load i16, i16* %arrayidx
+  %conv = sext i16 %0 to i32
+  %add = shl nsw i32 %conv, 4
+  %conv2 = add nsw i32 %add, 32
+  %or = and i32 %conv, 204
+  %conv8 = or i32 %or, 51
+  %mul = mul nuw nsw i32 %conv8, 60
+  %and = and i32 %conv2, %conv13
+  %mul.masked = and i32 %mul, 252
+  %conv17 = xor i32 %mul.masked, %conv11
+  %mul18 = mul nuw nsw i32 %conv17, %and
+  %conv19 = trunc i32 %mul18 to i8
+  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %conv19, i8* %arrayidx21
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_g(
+; CHECK: load <4 x i16>
+; CHECK: load <4 x i16>
+; CHECK: shl nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: and <4 x i32>
+; CHECK: lshr <4 x i32>
+define void @add_g(i16* noalias nocapture readonly %p, i16* noalias nocapture readonly %q, i16* noalias nocapture %r, i16 %arg1, i32 %len) #0 {
+  %1 = icmp sgt i32 %len, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = sext i16 %arg1 to i64
+  br label %3
+
+._crit_edge:                                      ; preds = %3, %0
+  ret void
+
+;