Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -465,7 +465,8 @@
 
   /// Collect the instructions from the original loop that would be trivially
   /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions();
+  void collectTriviallyDeadInstructions(
+      SmallPtrSetImpl<Instruction *> &DeadInstructions);
 
   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
   /// represented as.
@@ -479,8 +480,9 @@
   /// and DST.
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
-  /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(BasicBlock *BB);
+  /// A helper function to vectorize a single instruction within the innermost
+  /// loop.
+  void vectorizeInstruction(Instruction &I);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -798,14 +800,6 @@
   // Record whether runtime checks are added.
   bool AddedSafetyChecks;
 
-  // Holds instructions from the original loop whose counterparts in the
-  // vectorized loop would be trivially dead if generated. For example,
-  // original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap<PHINode *, Value *> IVEndValues;
@@ -3892,19 +3886,26 @@
   //
   //===------------------------------------------------===//
 
-  // Collect instructions from the original loop that will become trivially
-  // dead in the vectorized loop. We don't need to vectorize these
-  // instructions.
-  collectTriviallyDeadInstructions();
+  // Collect instructions from the original loop that will become trivially dead
+  // in the vectorized loop. We don't need to vectorize these instructions. For
+  // example, original induction update instructions can become dead because we
+  // separately emit induction "steps" when generating code for the new loop.
+  // Similarly, we create a new latch condition when setting up the structure
+  // of the new loop, so the old one can become dead.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
+  collectTriviallyDeadInstructions(DeadInstructions);
 
   // Scan the loop in a topological order to ensure that defs are vectorized
   // before users.
   LoopBlocksDFS DFS(OrigLoop);
   DFS.perform(LI);
 
-  // Vectorize all of the blocks in the original loop.
+  // Vectorize all instructions in the original loop that will not become
+  // trivially dead when vectorized.
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
-    vectorizeBlockInLoop(BB);
+    for (Instruction &I : *BB)
+      if (!DeadInstructions.count(&I))
+        vectorizeInstruction(I);
 
   // Insert truncates and extends for any truncated instructions as hints to
   // InstCombine.
@@ -4334,7 +4335,8 @@
   }
 }
 
-void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+void InnerLoopVectorizer::collectTriviallyDeadInstructions(
+    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
   BasicBlock *Latch = OrigLoop->getLoopLatch();
 
   // We create new control-flow for the vectorized loop, so the original
@@ -4720,333 +4722,323 @@
   return !CInt || CInt->isZero();
 }
 
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
-  // For each instruction in the old loop.
-  for (Instruction &I : *BB) {
-
-    // If the instruction will become trivially dead when vectorized, we don't
-    // need to generate it.
-    if (DeadInstructions.count(&I))
-      continue;
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+  // Scalarize instructions that should remain scalar after vectorization.
+  if (VF > 1 &&
+      !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+      shouldScalarizeInstruction(&I)) {
+    scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+    return;
+  }
 
-    // Scalarize instructions that should remain scalar after vectorization.
-    if (VF > 1 &&
-        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
-          isa<DbgInfoIntrinsic>(&I)) &&
-        shouldScalarizeInstruction(&I)) {
-      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
-      continue;
-    }
+  switch (I.getOpcode()) {
+  case Instruction::Br:
+    // Nothing to do for PHIs and BR, since we already took care of the
+    // loop control flow instructions.
+    break;
+  case Instruction::PHI: {
+    // Vectorize PHINodes.
+    widenPHIInstruction(&I, UF, VF);
+    break;
+  } // End of PHI.
+  case Instruction::GetElementPtr: {
+    // Construct a vector GEP by widening the operands of the scalar GEP as
+    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+    // results in a vector of pointers when at least one operand of the GEP
+    // is vector-typed. Thus, to keep the representation compact, we only use
+    // vector-typed operands for loop-varying values.
+    auto *GEP = cast<GetElementPtrInst>(&I);
+    VectorParts Entry(UF);
 
-    switch (I.getOpcode()) {
-    case Instruction::Br:
-      // Nothing to do for PHIs and BR, since we already took care of the
-      // loop control flow instructions.
-      continue;
-    case Instruction::PHI: {
-      // Vectorize PHINodes.
-      widenPHIInstruction(&I, UF, VF);
-      continue;
-    } // End of PHI.
-    case Instruction::GetElementPtr: {
-      // Construct a vector GEP by widening the operands of the scalar GEP as
-      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-      // results in a vector of pointers when at least one operand of the GEP
-      // is vector-typed. Thus, to keep the representation compact, we only use
-      // vector-typed operands for loop-varying values.
-      auto *GEP = cast<GetElementPtrInst>(&I);
-      VectorParts Entry(UF);
+    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+      // If we are vectorizing, but the GEP has only loop-invariant operands,
+      // the GEP we build (by only using vector-typed operands for
+      // loop-varying values) would be a scalar pointer. Thus, to ensure we
+      // produce a vector of pointers, we need to either arbitrarily pick an
+      // operand to broadcast, or broadcast a clone of the original GEP.
+      // Here, we broadcast a clone of the original.
+      //
+      // TODO: If at some point we decide to scalarize instructions having
+      //       loop-invariant operands, this special case will no longer be
+      //       required. We would add the scalarization decision to
+      //       collectLoopScalars() and teach getVectorValue() to broadcast
+      //       the lane-zero scalar value.
+      auto *Clone = Builder.Insert(GEP->clone());
+      for (unsigned Part = 0; Part < UF; ++Part)
+        Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+    } else {
+      // If the GEP has at least one loop-varying operand, we are sure to
+      // produce a vector of pointers. But if we are only unrolling, we want
+      // to produce a scalar GEP for each unroll part. Thus, the GEP we
+      // produce with the code below will be scalar (if VF == 1) or vector
+      // (otherwise). Note that for the unroll-only case, we still maintain
+      // values in the vector mapping with initVector, as we do for other
+      // instructions.
+      for (unsigned Part = 0; Part < UF; ++Part) {
 
-      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-        // If we are vectorizing, but the GEP has only loop-invariant operands,
-        // the GEP we build (by only using vector-typed operands for
-        // loop-varying values) would be a scalar pointer. Thus, to ensure we
-        // produce a vector of pointers, we need to either arbitrarily pick an
-        // operand to broadcast, or broadcast a clone of the original GEP.
-        // Here, we broadcast a clone of the original.
-        //
-        // TODO: If at some point we decide to scalarize instructions having
-        //       loop-invariant operands, this special case will no longer be
-        //       required. We would add the scalarization decision to
-        //       collectLoopScalars() and teach getVectorValue() to broadcast
-        //       the lane-zero scalar value.
-        auto *Clone = Builder.Insert(GEP->clone());
-        for (unsigned Part = 0; Part < UF; ++Part)
-          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
-      } else {
-        // If the GEP has at least one loop-varying operand, we are sure to
-        // produce a vector of pointers. But if we are only unrolling, we want
-        // to produce a scalar GEP for each unroll part. Thus, the GEP we
-        // produce with the code below will be scalar (if VF == 1) or vector
-        // (otherwise). Note that for the unroll-only case, we still maintain
-        // values in the vector mapping with initVector, as we do for other
-        // instructions.
-        for (unsigned Part = 0; Part < UF; ++Part) {
-
-          // The pointer operand of the new GEP. If it's loop-invariant, we
-          // won't broadcast it.
-          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                          ? GEP->getPointerOperand()
-                          : getVectorValue(GEP->getPointerOperand())[Part];
-
-          // Collect all the indices for the new GEP. If any index is
-          // loop-invariant, we won't broadcast it.
-          SmallVector<Value *, 4> Indices;
-          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-            if (OrigLoop->isLoopInvariant(U.get()))
-              Indices.push_back(U.get());
-            else
-              Indices.push_back(getVectorValue(U.get())[Part]);
-          }
-
-          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-          // but it should be a vector, otherwise.
-          auto *NewGEP = GEP->isInBounds()
-                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
-                             : Builder.CreateGEP(Ptr, Indices);
-          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-                 "NewGEP is not a pointer vector");
-          Entry[Part] = NewGEP;
+        // The pointer operand of the new GEP. If it's loop-invariant, we
+        // won't broadcast it.
+        auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                        ? GEP->getPointerOperand()
+                        : getVectorValue(GEP->getPointerOperand())[Part];
+
+        // Collect all the indices for the new GEP. If any index is
+        // loop-invariant, we won't broadcast it.
+        SmallVector<Value *, 4> Indices;
+        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+          if (OrigLoop->isLoopInvariant(U.get()))
+            Indices.push_back(U.get());
+          else
+            Indices.push_back(getVectorValue(U.get())[Part]);
         }
+
+        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+        // but it should be a vector, otherwise.
+        auto *NewGEP = GEP->isInBounds()
+                           ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                           : Builder.CreateGEP(Ptr, Indices);
+        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+               "NewGEP is not a pointer vector");
+        Entry[Part] = NewGEP;
       }
+    }
 
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, GEP);
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, GEP);
+    break;
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // Scalarize with predication if this instruction may divide by zero and
+    // block execution is conditional, otherwise fallthrough.
+    if (Legal->isScalarWithPredication(&I)) {
+      scalarizeInstruction(&I, true);
       break;
     }
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::SRem:
-    case Instruction::URem:
-      // Scalarize with predication if this instruction may divide by zero and
-      // block execution is conditional, otherwise fallthrough.
-      if (Legal->isScalarWithPredication(&I)) {
-        scalarizeInstruction(&I, true);
-        continue;
-      }
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::FDiv:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      // Just widen binops.
-      auto *BinOp = cast<BinaryOperator>(&I);
-      setDebugLocFromInst(Builder, BinOp);
-      const VectorParts &A = getVectorValue(BinOp->getOperand(0));
-      const VectorParts &B = getVectorValue(BinOp->getOperand(1));
-
-      // Use this vector value for all users of the original instruction.
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Just widen binops.
+    auto *BinOp = cast<BinaryOperator>(&I);
+    setDebugLocFromInst(Builder, BinOp);
+    const VectorParts &A = getVectorValue(BinOp->getOperand(0));
+    const VectorParts &B = getVectorValue(BinOp->getOperand(1));
 
-        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
-          VecOp->copyIRFlags(BinOp);
+    // Use this vector value for all users of the original instruction.
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
 
-        Entry[Part] = V;
-      }
+      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+        VecOp->copyIRFlags(BinOp);
 
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, BinOp);
-      break;
+      Entry[Part] = V;
     }
-    case Instruction::Select: {
-      // Widen selects.
-      // If the selector is loop invariant we can create a select
-      // instruction with a scalar condition. Otherwise, use vector-select.
-      auto *SE = PSE.getSE();
-      bool InvariantCond =
-          SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
-      setDebugLocFromInst(Builder, &I);
-
-      // The condition can be loop invariant  but still defined inside the
-      // loop. This means that we can't just use the original 'cond' value.
-      // We have to take the 'vectorized' value and pick the first lane.
-      // Instcombine will make this a no-op.
-      const VectorParts &Cond = getVectorValue(I.getOperand(0));
-      const VectorParts &Op0 = getVectorValue(I.getOperand(1));
-      const VectorParts &Op1 = getVectorValue(I.getOperand(2));
 
-      auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, BinOp);
+    break;
+  }
+  case Instruction::Select: {
+    // Widen selects.
+    // If the selector is loop invariant we can create a select
+    // instruction with a scalar condition. Otherwise, use vector-select.
+    auto *SE = PSE.getSE();
+    bool InvariantCond =
+        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+    setDebugLocFromInst(Builder, &I);
+
+    // The condition can be loop invariant  but still defined inside the
+    // loop. This means that we can't just use the original 'cond' value.
+    // We have to take the 'vectorized' value and pick the first lane.
+    // Instcombine will make this a no-op.
+    const VectorParts &Cond = getVectorValue(I.getOperand(0));
+    const VectorParts &Op0 = getVectorValue(I.getOperand(1));
+    const VectorParts &Op1 = getVectorValue(I.getOperand(2));
 
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Entry[Part] = Builder.CreateSelect(
-            InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
-      }
+    auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
 
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Entry[Part] = Builder.CreateSelect(
+          InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
     }
 
-    case Instruction::ICmp:
-    case Instruction::FCmp: {
-      // Widen compares. Generate vector compares.
-      bool FCmp = (I.getOpcode() == Instruction::FCmp);
-      auto *Cmp = dyn_cast<CmpInst>(&I);
-      setDebugLocFromInst(Builder, Cmp);
-      const VectorParts &A = getVectorValue(Cmp->getOperand(0));
-      const VectorParts &B = getVectorValue(Cmp->getOperand(1));
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *C = nullptr;
-        if (FCmp) {
-          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
-          cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
-        } else {
-          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
-        }
-        Entry[Part] = C;
-      }
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
 
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // Widen compares. Generate vector compares.
+    bool FCmp = (I.getOpcode() == Instruction::FCmp);
+    auto *Cmp = dyn_cast<CmpInst>(&I);
+    setDebugLocFromInst(Builder, Cmp);
+    const VectorParts &A = getVectorValue(Cmp->getOperand(0));
+    const VectorParts &B = getVectorValue(Cmp->getOperand(1));
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *C = nullptr;
+      if (FCmp) {
+        C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+        cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+      } else {
+        C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+      }
+      Entry[Part] = C;
     }
 
-    case Instruction::Store:
-    case Instruction::Load:
-      vectorizeMemoryInstruction(&I);
-      break;
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      auto *CI = dyn_cast<CastInst>(&I);
-      setDebugLocFromInst(Builder, CI);
-
-      // Optimize the special case where the source is a constant integer
-      // induction variable. Notice that we can only optimize the 'trunc' case
-      // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
-      // (c) other casts depend on pointer size.
-      if (Cost->isOptimizableIVTruncate(CI, VF)) {
-        widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
-                              cast<TruncInst>(CI));
-        break;
-      }
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
 
-      /// Vectorize casts.
-      Type *DestTy =
-          (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+  case Instruction::Store:
+  case Instruction::Load:
+    vectorizeMemoryInstruction(&I);
+    break;
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    auto *CI = dyn_cast<CastInst>(&I);
+    setDebugLocFromInst(Builder, CI);
 
-      const VectorParts &A = getVectorValue(CI->getOperand(0));
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
+    // Optimize the special case where the source is a constant integer
+    // induction variable. Notice that we can only optimize the 'trunc' case
+    // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+    // (c) other casts depend on pointer size.
+    if (Cost->isOptimizableIVTruncate(CI, VF)) {
+      widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+                            cast<TruncInst>(CI));
       break;
     }
 
-    case Instruction::Call: {
-      // Ignore dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(I))
-        break;
-      setDebugLocFromInst(Builder, &I);
+    /// Vectorize casts.
+    Type *DestTy =
+        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
 
-      Module *M = BB->getParent()->getParent();
-      auto *CI = cast<CallInst>(&I);
+    const VectorParts &A = getVectorValue(CI->getOperand(0));
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
 
-      StringRef FnName = CI->getCalledFunction()->getName();
-      Function *F = CI->getCalledFunction();
-      Type *RetTy = ToVectorTy(CI->getType(), VF);
-      SmallVector<Type *, 4> Tys;
-      for (Value *ArgOperand : CI->arg_operands())
-        Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-      if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-                 ID == Intrinsic::lifetime_start)) {
-        scalarizeInstruction(&I);
-        break;
-      }
-      // The flag shows whether we use Intrinsic or a usual Call for vectorized
-      // version of the instruction.
-      // Is it beneficial to perform intrinsic call compared to lib call?
-      bool NeedToScalarize;
-      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
-      bool UseVectorIntrinsic =
-          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
-      if (!UseVectorIntrinsic && NeedToScalarize) {
-        scalarizeInstruction(&I);
-        break;
-      }
+  case Instruction::Call: {
+    // Ignore dbg intrinsics.
+    if (isa<DbgInfoIntrinsic>(I))
+      break;
+    setDebugLocFromInst(Builder, &I);
 
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Args;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          Value *Arg = CI->getArgOperand(i);
-          // Some intrinsics have a scalar argument - don't replace it with a
-          // vector.
-          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
-            const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
-            Arg = VectorArg[Part];
-          }
-          Args.push_back(Arg);
-        }
+    Module *M = I.getParent()->getParent()->getParent();
+    auto *CI = cast<CallInst>(&I);
 
-        Function *VectorF;
-        if (UseVectorIntrinsic) {
-          // Use vector version of the intrinsic.
-          Type *TysForDecl[] = {CI->getType()};
-          if (VF > 1)
-            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
-          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
-        } else {
-          // Use vector version of the library call.
-          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
-          assert(!VFnName.empty() && "Vector function name is empty.");
-          VectorF = M->getFunction(VFnName);
-          if (!VectorF) {
-            // Generate a declaration
-            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
-            VectorF =
-                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
-            VectorF->copyAttributesFrom(F);
-          }
+    StringRef FnName = CI->getCalledFunction()->getName();
+    Function *F = CI->getCalledFunction();
+    Type *RetTy = ToVectorTy(CI->getType(), VF);
+    SmallVector<Type *, 4> Tys;
+    for (Value *ArgOperand : CI->arg_operands())
+      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+               ID == Intrinsic::lifetime_start)) {
+      scalarizeInstruction(&I);
+      break;
+    }
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize;
+    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    bool UseVectorIntrinsic =
+        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+    if (!UseVectorIntrinsic && NeedToScalarize) {
+      scalarizeInstruction(&I);
+      break;
+    }
+
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<Value *, 4> Args;
+      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+        Value *Arg = CI->getArgOperand(i);
+        // Some intrinsics have a scalar argument - don't replace it with a
+        // vector.
+        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+          const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+          Arg = VectorArg[Part];
         }
-        assert(VectorF && "Can't create vector function.");
+        Args.push_back(Arg);
+      }
 
-        SmallVector<OperandBundleDef, 1> OpBundles;
-        CI->getOperandBundlesAsDefs(OpBundles);
-        CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+      Function *VectorF;
+      if (UseVectorIntrinsic) {
+        // Use vector version of the intrinsic.
+        Type *TysForDecl[] = {CI->getType()};
+        if (VF > 1)
+          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+      } else {
+        // Use vector version of the library call.
+        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+        assert(!VFnName.empty() && "Vector function name is empty.");
+        VectorF = M->getFunction(VFnName);
+        if (!VectorF) {
+          // Generate a declaration
+          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+          VectorF =
+              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+          VectorF->copyAttributesFrom(F);
+        }
+      }
+      assert(VectorF && "Can't create vector function.");
 
-        if (isa<FPMathOperator>(V))
-          V->copyFastMathFlags(CI);
+      SmallVector<OperandBundleDef, 1> OpBundles;
+      CI->getOperandBundlesAsDefs(OpBundles);
+      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
 
-        Entry[Part] = V;
-      }
+      if (isa<FPMathOperator>(V))
+        V->copyFastMathFlags(CI);
 
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
+      Entry[Part] = V;
     }
 
-    default:
-      // All other instructions are unsupported. Scalarize them.
-      scalarizeInstruction(&I);
-      break;
-    } // end of switch.
-  }   // end of for_each instr.
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
+
+  default:
+    // All other instructions are unsupported. Scalarize them.
+    scalarizeInstruction(&I);
+    break;
+  } // end of switch.
 }
 
 void InnerLoopVectorizer::updateAnalysis() {