Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -384,11 +384,11 @@
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
   /// scalarized instruction behind an if block predicated on the control
   /// dependence of the instruction.
   virtual void scalarizeInstruction(Instruction *Instr,
-                                    bool IfPredicateStore = false);
+                                    bool IfPredicateInstr = false);
 
   /// Vectorize Load and Store instructions,
   virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -591,7 +591,7 @@
   /// Store instructions that should be predicated, as a pair
   ///   <StoreInst, Predicate>
-  SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
+  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
   EdgeMaskCache MaskCache;
   /// Trip count of the original loop.
   Value *TripCount;
@@ -625,7 +625,7 @@
 
 private:
   void scalarizeInstruction(Instruction *Instr,
-                            bool IfPredicateStore = false) override;
+                            bool IfPredicateInstr = false) override;
   void vectorizeMemoryInstruction(Instruction *Instr) override;
   Value *getBroadcastInstrs(Value *V) override;
   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2675,8 +2675,11 @@
 }
 
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
-                                               bool IfPredicateStore) {
+                                               bool IfPredicateInstr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  DEBUG(dbgs() << "LV: Scalarizing"
+               << (IfPredicateInstr ? " and predicating:" : ":")
+               << *Instr << '\n');
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
 
@@ -2720,7 +2723,7 @@
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
   VectorParts Cond;
-  if (IfPredicateStore) {
+  if (IfPredicateInstr) {
     assert(Instr->getParent()->getSinglePredecessor() &&
            "Only support single predecessor blocks");
     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -2734,7 +2737,7 @@
 
       // Start if-block.
       Value *Cmp = nullptr;
-      if (IfPredicateStore) {
+      if (IfPredicateInstr) {
         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                  ConstantInt::get(Cmp->getType(), 1));
@@ -2773,9 +2776,8 @@
       VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                      Builder.getInt32(Width));
       // End if-block.
-      if (IfPredicateStore)
-        PredicatedStores.push_back(
-            std::make_pair(cast<StoreInst>(Cloned), Cmp));
+      if (IfPredicateInstr)
+        PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
     }
   }
 }
@@ -3306,9 +3308,12 @@
   return V;
 }
 
-/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
-/// the result needs to be inserted and/or extracted from vectors.
+/// \brief Estimate the overhead of scalarizing a value. Insert and Extract are
+/// set if the result needs to be inserted and/or extracted from vectors.
+/// If the instruction is also to be predicated, the insertion cost is
+/// increased by the cost of the required PHI node.
 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+                                         bool Predicate,
                                          const TargetTransformInfo &TTI) {
   if (Ty->isVoidTy())
     return 0;
@@ -3317,15 +3322,56 @@
   unsigned Cost = 0;
 
   for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
-    if (Insert)
+    if (Insert) {
+      if (Predicate)
+        Cost += TTI.getCFInstrCost(Instruction::PHI);
       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
+    }
     if (Extract)
       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
   }
 
+  // We assume that if-converted blocks have a 50% chance of being executed.
+  // Predicated scalarized instructions are avoided due to the CF that
+  // bypasses turned-off lanes. The extracts and inserts will be sunk/hoisted
+  // to the predicated basic block and are subject to the same assumption.
+  if (Predicate)
+    Cost /= 2;
+
   return Cost;
 }
 
+/// \brief Estimate the overhead of scalarizing an instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
+                                         Type *RetTy, bool Predicate,
+                                         const TargetTransformInfo &TTI) {
+  unsigned ScalarizationCost =
+      getScalarizationOverhead(RetTy, true, false, Predicate, TTI);
+
+  for (Type *Ty : OpTys)
+    ScalarizationCost +=
+        getScalarizationOverhead(Ty, false, true, Predicate, TTI);
+
+  return ScalarizationCost;
+}
+
+/// \brief Estimate the overhead of scalarizing an instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(Instruction *I, Type *RetTy,
+                                         unsigned VF, bool Predicate,
+                                         const TargetTransformInfo &TTI) {
+  if (VF == 1)
+    return 0;
+
+  SmallVector<Type *, 4> OpTys;
+  unsigned OperandsNum = I->getNumOperands();
+  for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
+    OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+
+  return getScalarizationOverhead(OpTys, RetTy, Predicate, TTI);
+}
+
 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
 // Return the cost of the instruction, including scalarization overhead if it's
 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
@@ -3356,10 +3402,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost =
-      getScalarizationOverhead(RetTy, true, false, TTI);
-  for (Type *Ty : Tys)
-    ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3779,15 +3822,61 @@
   // Make sure DomTree is updated.
   updateAnalysis();
 
-  // Predicate any stores.
-  for (auto KV : PredicatedStores) {
+  // Predicate any instructions.
+  for (auto KV : PredicatedInstructions) {
     BasicBlock::iterator I(KV.first);
-    auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
+    BasicBlock *Head = I->getParent();
+    auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
     auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
                                         /*BranchWeights=*/nullptr, DT, LI);
     I->moveBefore(T);
-    I->getParent()->setName("pred.store.if");
-    BB->setName("pred.store.continue");
+    // Try to move any extract-element we may have created for the predicated
+    // instruction into the Then block.
+    for (unsigned In = 0; In < I->getNumOperands(); ++In) {
+      ExtractElementInst *OpInst =
+          dyn_cast<ExtractElementInst>(I->getOperand(In));
+      if (!OpInst)
+        continue;
+      bool CanSinkToUse = true;
+      for (User *U : OpInst->users()) {
+        if (U != &*I) {
+          CanSinkToUse = false;
+          break;
+        }
+      }
+      if (CanSinkToUse)
+        OpInst->moveBefore(&*I);
+    }
+    I->getParent()->setName(Twine("pred.") + Twine(I->getOpcodeName()) +
+                            Twine(".if"));
+    BB->setName(Twine("pred.") + Twine(I->getOpcodeName()) +
+                Twine(".continue"));
+    // If the instruction is non-void, create a Phi node at the reconvergence point.
+    if (!I->getType()->isVoidTy()) {
+      Value *IncomingTrue = nullptr;
+      Value *IncomingFalse = nullptr;
+      auto Users = I->users();
+      if (std::distance(Users.begin(), Users.end()) == 1 &&
+          isa<InsertElementInst>(*I->users().begin())) {
+        // If the predicated instruction is feeding an insert-element, move it
+        // into the Then block; a Phi node will be created for the vector.
+        InsertElementInst *IEI = cast<InsertElementInst>(*Users.begin());
+        IEI->moveBefore(T);
+        IncomingTrue = IEI; // the new vector with the inserted element.
+        IncomingFalse = IEI->getOperand(0); // the unmodified vector
+      } else {
+        // A Phi node will be created for the scalar predicated instruction.
+        IncomingTrue = &*I;
+        IncomingFalse = UndefValue::get(I->getType());
+      }
+      BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
+      assert(PostDom && "Then block has multiple successors");
+      PHINode *Phi =
+          PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
+      IncomingTrue->replaceAllUsesWith(Phi);
+      Phi->addIncoming(IncomingFalse, Head);
+      Phi->addIncoming(IncomingTrue, I->getParent());
+    }
   }
 
   DEBUG(DT->verifyDomTree());
   // Remove redundant induction instructions.
@@ -4156,18 +4245,23 @@
       continue;
     } // End of PHI.
 
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::FRem:
+      // Scalarize if block needs predication, otherwise fallthrough.
+      if (Legal->blockNeedsPredication(I.getParent())) {
+        scalarizeInstruction(&I, true);
+        continue;
+      }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
-   case Instruction::UDiv:
-   case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
-   case Instruction::SRem:
-   case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
@@ -4998,17 +5092,6 @@
     }
     if (I.mayThrow())
       return false;
-
-    // The instructions below can trap.
-    switch (I.getOpcode()) {
-    default:
-      continue;
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-      return false;
-    }
   }
 
   return true;
@@ -5781,7 +5864,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
-
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     VectorizationCostTy BlockCost;
@@ -5938,17 +6020,23 @@
       // TODO: IF-converted IFs become selects.
      return 0;
    }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // Predicated scalarized instructions are avoided due to the CF that
+    // bypasses turned-off lanes. If we are not predicating, fallthrough.
+    if (VF > 1 && Legal->blockNeedsPredication(I->getParent()))
+      return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / 2 +
+             getScalarizationOverhead(I, VectorTy, VF, true, TTI);
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
   case Instruction::Mul:
   case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
   case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
   case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
@@ -6182,28 +6270,11 @@
       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
     return CallCost;
   }
-  default: {
-    // We are scalarizing the instruction. Return the cost of the scalar
-    // instruction, plus the cost of insert and extract into vector
-    // elements, times the vector width.
-    unsigned Cost = 0;
-
-    if (!RetTy->isVoidTy() && VF != 1) {
-      unsigned InsCost =
-          TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
-      unsigned ExtCost =
-          TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
-
-      // The cost of inserting the results plus extracting each one of the
-      // operands.
-      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
-    }
-
+  default:
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
-    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
-    return Cost;
-  }
+    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+           getScalarizationOverhead(I, VectorTy, VF, false, TTI);
   } // end of switch.
 }
 
@@ -6307,7 +6378,7 @@
 }
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
-                                             bool IfPredicateStore) {
+                                             bool IfPredicateInstr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -6350,7 +6421,7 @@
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
   VectorParts Cond;
-  if (IfPredicateStore) {
+  if (IfPredicateInstr) {
     assert(Instr->getParent()->getSinglePredecessor() &&
            "Only support single predecessor blocks");
     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -6363,7 +6434,7 @@
 
     // Start an "if (pred) a[i] = ..." block.
     Value *Cmp = nullptr;
-    if (IfPredicateStore) {
+    if (IfPredicateInstr) {
       if (Cond[Part]->getType()->isVectorTy())
         Cond[Part] =
             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
@@ -6394,16 +6465,16 @@
     VecResults[Part] = Cloned;
 
     // End if-block.
-    if (IfPredicateStore)
-      PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
+    if (IfPredicateInstr)
+      PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
   }
 }
 
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
   auto *SI = dyn_cast<StoreInst>(Instr);
-  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+  bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
 
-  return scalarizeInstruction(Instr, IfPredicateStore);
+  return scalarizeInstruction(Instr, IfPredicateInstr);
 }
 
 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Index: test/Transforms/LoopVectorize/if-pred-non-void.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test predication of non-void instructions, specifically the creation of an
+; insertelement and a Phi node.
+define void @test(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end
+  ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: br i1 %{{.*}}, label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
+; CHECK: [[cond]]:
+; CHECK: %[[v0:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK: %[[v1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[v0]], i32 0
+; CHECK: br label %[[else]]
+; CHECK: [[else]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %vector.body ], [ %[[v1]], %[[cond]] ]
+
+for.body:                                         ; preds = %if.end, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 27
+  %cmp1 = icmp slt i32 %0, 100
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %div = sdiv i32 %add, %0
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  %y.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
+  store i32 %y.0, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: test/Transforms/LoopVectorize/if-pred-stores.ll
===================================================================
--- test/Transforms/LoopVectorize/if-pred-stores.ll
+++ test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1,7 +1,6 @@
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
 ; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
 
"e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -17,49 +16,27 @@ ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true -; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: +; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 +; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true -; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: +; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 +; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]: -; VEC-IC-LABEL: test -; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, -; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, -; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0 -; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]] -; -; VEC-IC: [[cond]]: -; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0 -; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else:.+]] -; -; VEC-IC: [[else]]: -; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1 -; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]] -; -; VEC-IC: [[cond2]]: -; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1 -; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4 -; VEC-IC: br label %[[else2:.+]] -; -; VEC-IC: [[else2]]: - ; UNROLL-LABEL: test ; UNROLL: vector.body: ; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0