Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -252,10 +252,12 @@
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class LoopVectorizationRequirements;
+class VPBlendRecipe;
 class VPInterleaveRecipe;
 class VPReplicateRecipe;
 class VPWidenIntOrFpInductionRecipe;
 class VPWidenRecipe;
+class VPWidenMemoryInstructionRecipe;
 
 /// Returns true if the given loop body has a cycle, excluding the loop
 /// itself.
@@ -426,6 +428,10 @@
   /// mask for the block BB.
   VectorParts createBlockInMask(BasicBlock *BB);
 
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -478,6 +484,13 @@
   /// Try to vectorize the interleaved access group that \p Instr belongs to.
   void vectorizeInterleaveGroup(Instruction *Instr);
 
+  /// Vectorize Load and Store instructions,
+  virtual void vectorizeMemoryInstruction(Instruction *Instr);
+
+  /// \brief Set the debug location in the builder using the debug location in
+  /// the instruction.
+  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
 protected:
   /// A small list of PHINodes.
   typedef SmallVector<PHINode *, 4> PhiVector;
@@ -528,17 +541,10 @@
   /// represented as.
   void truncateToMinimalBitwidths();
 
-  /// A helper function that computes the predicate of the edge between SRC
-  /// and DST.
-  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
   void updateAnalysis();
 
-  /// Vectorize Load and Store instructions,
-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
-
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -617,10 +623,6 @@
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
-  /// \brief Set the debug location in the builder using the debug location in
-  /// the instruction.
-  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
   /// The original loop.
   Loop *OrigLoop;
   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
@@ -2208,6 +2210,11 @@
   /// to \p Range.End.
   VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
 
+  // Check if \I is a memory instruction to be widened for \p Range.Start and
+  // potentially masked.
+  VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
+                                                   VFRange &Range);
+
   /// Check if an induction recipe should be constructed for \I within the given
   /// VF \p Range. If so build and return it. If not, return null. \p Range.End
   /// may be decreased to ensure same decision from \p Range.Start to
@@ -2215,6 +2222,11 @@
   VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
                                                         VFRange &Range);
 
+  /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+  /// a sequence of select instructions as the vectorizer currently performs
+  /// full if-conversion.
+  VPBlendRecipe *tryToBlend(Instruction *I);
+
   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
   /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
   /// extended to include \p I or else build a new VPWidenRecipe for it and
@@ -4406,77 +4418,6 @@
   } while (Changed);
 }
 
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
-  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
-  // Look for cached value.
-  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
-  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
-  if (ECEntryIt != EdgeMaskCache.end())
-    return ECEntryIt->second;
-
-  VectorParts SrcMask = createBlockInMask(Src);
-
-  // The terminator has to be a branch inst!
-  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
-  assert(BI && "Unexpected terminator found");
-
-  if (!BI->isConditional())
-    return EdgeMaskCache[Edge] = SrcMask;
-
-  VectorParts EdgeMask(UF);
-  for (unsigned Part = 0; Part < UF; ++Part) {
-    auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
-    if (BI->getSuccessor(0) != Dst)
-      EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
-
-    if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
-      EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
-
-    EdgeMask[Part] = EdgeMaskPart;
-  }
-
-  return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
-  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
-  // Look for cached value.
-  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
-  if (BCEntryIt != BlockMaskCache.end())
-    return BCEntryIt->second;
-
-  // All-one mask is modelled as no-mask following the convention for masked
-  // load/store/gather/scatter. Initialize BlockMask to no-mask.
-  VectorParts BlockMask(UF);
-  for (unsigned Part = 0; Part < UF; ++Part)
-    BlockMask[Part] = nullptr;
-
-  // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
-    return BlockMaskCache[BB] = BlockMask;
-
-  // This is the block mask. We OR all incoming edges.
-  for (auto *Predecessor : predecessors(BB)) {
-    VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
-    if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
-      return BlockMaskCache[BB] = EdgeMask;
-
-    if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
-      BlockMask = EdgeMask;
-      continue;
-    }
-
-    for (unsigned Part = 0; Part < UF; ++Part)
-      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
-  }
-
-  return BlockMaskCache[BB] = BlockMask;
-}
-
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
@@ -4497,43 +4438,6 @@
   }
 
   setDebugLocFromInst(Builder, P);
-  // Check for PHI nodes that are lowered to vector selects.
-  if (P->getParent() != OrigLoop->getHeader()) {
-    // We know that all PHIs in non-header blocks are converted into
-    // selects, so we don't have to worry about the insertion order and we
-    // can just use the builder.
-    // At this point we generate the predication tree. There may be
-    // duplications since this is a simple recursive scan, but future
-    // optimizations will clean it up.
-
-    unsigned NumIncoming = P->getNumIncomingValues();
-
-    // Generate a sequence of selects of the form:
-    // SELECT(Mask3, In3,
-    //      SELECT(Mask2, In2,
-    //                   ( ...)))
-    VectorParts Entry(UF);
-    for (unsigned In = 0; In < NumIncoming; In++) {
-      VectorParts Cond =
-          createEdgeMask(P->getIncomingBlock(In), P->getParent());
-
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
-        assert((Cond[Part] || NumIncoming == 1) &&
-               "Multiple predecessors with one predecessor having a full mask");
-        if (In == 0)
-          Entry[Part] = In0; // Initialize with the first incoming value.
-        else
-          // Select between the current value and the previous incoming edge
-          // based on the incoming mask.
-          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
-                                             "predphi");
-      }
-    }
-    for (unsigned Part = 0; Part < UF; ++Part)
-      VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
-    return;
-  }
 
   // This PHINode must be an induction variable.
   // Make sure that we know about it.
@@ -4758,10 +4662,6 @@
     break;
   }
 
-  case Instruction::Store:
-  case Instruction::Load:
-    vectorizeMemoryInstruction(&I);
-    break;
   case Instruction::ZExt:
   case Instruction::SExt:
   case Instruction::FPToUI:
@@ -4866,7 +4766,7 @@
   }
 
   default:
-    // All other instructions are scalarized.
+    // This instruction is not vectorized by simple widening.
     DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
     llvm_unreachable("Unhandled instruction!");
   } // end of switch.
@@ -7752,6 +7652,82 @@
   }
 };
 
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
+class VPBlendRecipe : public VPRecipeBase {
+private:
+  PHINode *Phi;
+
+public:
+  VPBlendRecipe(PHINode *Phi) : VPRecipeBase(VPBlendSC), Phi(Phi) {}
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+  }
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override {
+    State.ILV->setDebugLocFromInst(State.Builder, Phi);
+    // We know that all PHIs in non-header blocks are converted into
+    // selects, so we don't have to worry about the insertion order and we
+    // can just use the builder.
+    // At this point we generate the predication tree. There may be
+    // duplications since this is a simple recursive scan, but future
+    // optimizations will clean it up.
+
+    unsigned NumIncoming = Phi->getNumIncomingValues();
+
+    // Generate a sequence of selects of the form:
+    // SELECT(Mask3, In3,
+    //      SELECT(Mask2, In2,
+    //                   ( ...)))
+    InnerLoopVectorizer::VectorParts Entry(State.UF);
+    for (unsigned In = 0; In < NumIncoming; In++) {
+      InnerLoopVectorizer::VectorParts Cond =
+        State.ILV->createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
+
+      for (unsigned Part = 0; Part < State.UF; ++Part) {
+        Value *In0 =
+          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+        assert((Cond[Part] || NumIncoming == 1) &&
+               "Multiple predecessors with one predecessor having a full mask");
+        if (In == 0)
+          Entry[Part] = In0; // Initialize with the first incoming value.
+        else
+          // Select between the current value and the previous incoming edge
+          // based on the incoming mask.
+          Entry[Part] = State.Builder.CreateSelect(Cond[Part], In0, Entry[Part],
+                                                   "predphi");
+      }
+    }
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+  }
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override {
+    O << " +\n" << Indent << "\"BLEND ";
+    Phi->printAsOperand(O, false);
+    O << " =";
+    if (Phi->getNumIncomingValues() == 1) {
+      // Not a User of any mask: not really blending, this is a
+      // single-predecessor phi.
+      O << " ";
+      Phi->getIncomingValue(0)->printAsOperand(O, false);
+    } else {
+      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; ++I) {
+        O << " ";
+        Phi->getIncomingValue(I)->printAsOperand(O, false);
+        O << "/";
+        Phi->getIncomingBlock(I)->printAsOperand(O, false);
+      }
+    }
+    O << "\\l\"";
+
+  }
+};
+
 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
 /// or stores into one wide load/store and shuffles.
 class VPInterleaveRecipe : public VPRecipeBase {
@@ -7896,6 +7872,32 @@
       << "\\l\"";
   }
 };
+
+/// A Recipe for widening load/store operations.
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+private:
+  Instruction &Instr;
+
+public:
+  VPWidenMemoryInstructionRecipe(Instruction &Instr)
+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {}
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+  }
+
+  /// Generate the wide load/store.
+  void execute(VPTransformState &State) override {
+    State.ILV->vectorizeMemoryInstruction(&Instr);
+  }
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override {
+    O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+    O << "\\l\"";
+  }
+};
 } // end anonymous namespace
 
 bool LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -7926,6 +7928,77 @@
   }
 }
 
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+  // Look for cached value.
+  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+  if (ECEntryIt != EdgeMaskCache.end())
+    return ECEntryIt->second;
+
+  VectorParts SrcMask = createBlockInMask(Src);
+
+  // The terminator has to be a branch inst!
+  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+  assert(BI && "Unexpected terminator found");
+
+  if (!BI->isConditional())
+    return EdgeMaskCache[Edge] = SrcMask;
+
+  VectorParts EdgeMask(UF);
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
+    if (BI->getSuccessor(0) != Dst)
+      EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
+
+    if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
+      EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
+
+    EdgeMask[Part] = EdgeMaskPart;
+  }
+
+  return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+  // Look for cached value.
+  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+  if (BCEntryIt != BlockMaskCache.end())
+    return BCEntryIt->second;
+
+  // All-one mask is modelled as no-mask following the convention for masked
+  // load/store/gather/scatter. Initialize BlockMask to no-mask.
+  VectorParts BlockMask(UF);
+  for (unsigned Part = 0; Part < UF; ++Part)
+    BlockMask[Part] = nullptr;
+
+  // Loop incoming mask is all-one.
+  if (OrigLoop->getHeader() == BB)
+    return BlockMaskCache[BB] = BlockMask;
+
+  // This is the block mask. We OR all incoming edges.
+  for (auto *Predecessor : predecessors(BB)) {
+    VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
+    if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
+      return BlockMaskCache[BB] = EdgeMask;
+
+    if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
+      BlockMask = EdgeMask;
+      continue;
+    }
+
+    for (unsigned Part = 0; Part < UF; ++Part)
+      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
+  }
+
+  return BlockMaskCache[BB] = BlockMask;
+}
+
 VPInterleaveRecipe *
 LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
                                                 VFRange &Range) {
@@ -7953,6 +8026,32 @@
   return new VPInterleaveRecipe(IG);
 }
 
+VPWidenMemoryInstructionRecipe *
+LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range) {
+  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+    return nullptr;
+
+  auto willWiden = [&](unsigned VF) -> bool {
+    if (VF == 1)
+      return false;
+    if (CM.isScalarAfterVectorization(I, VF) ||
+        CM.isProfitableToScalarize(I, VF))
+      return false;
+    LoopVectorizationCostModel::InstWidening Decision =
+        CM.getWideningDecision(I, VF);
+    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+           "CM decision should be taken at this point.");
+    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+           "Interleave memory opportunity should be caught earlier.");
+    return Decision != LoopVectorizationCostModel::CM_Scalarize;
+  };
+
+  if (!getDecisionAndClampRange(willWiden, Range))
+    return nullptr;
+
+  return new VPWidenMemoryInstructionRecipe(*I);
+}
+
 VPWidenIntOrFpInductionRecipe *
 LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
                                                  VFRange &Range) {
@@ -7987,6 +8086,14 @@
   return nullptr;
 }
 
+VPBlendRecipe *LoopVectorizationPlanner::tryToBlend(Instruction *I) {
+  PHINode *Phi = dyn_cast<PHINode>(I);
+  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
+    return nullptr;
+
+  return new VPBlendRecipe(Phi);
+}
+
 bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                           VFRange &Range) {
 
@@ -8243,11 +8350,21 @@
         continue;
       }
 
+      // Check if Instr is a memory operation that should be widened.
+      if ((Recipe = tryToWidenMemory(Instr, Range))) {
+        VPBB->appendRecipe(Recipe);
+        continue;
+      }
+
       // Check if Instr should form some PHI recipe.
       if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
         VPBB->appendRecipe(Recipe);
         continue;
       }
+      if ((Recipe = tryToBlend(Instr))) {
+        VPBB->appendRecipe(Recipe);
+        continue;
+      }
       if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
         VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
         continue;
Index: lib/Transforms/Vectorize/VPlan.h
===================================================================
--- lib/Transforms/Vectorize/VPlan.h
+++ lib/Transforms/Vectorize/VPlan.h
@@ -445,11 +445,13 @@
   /// SubclassID field of the VPRecipeBase objects. They are used for concrete
   /// type identification.
   typedef enum {
+    VPBlendSC,
     VPBranchOnMaskSC,
     VPInterleaveSC,
     VPPredInstPHISC,
     VPReplicateSC,
     VPWidenIntOrFpInductionSC,
+    VPWidenMemoryInstructionSC,
     VPWidenPHISC,
     VPWidenSC,
   } VPRecipeTy;
Index: test/Transforms/LoopVectorize/X86/x86-predication.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -58,3 +59,40 @@
   %tmp8 = phi i32 [ %tmp7, %for.inc ]
   ret i32 %tmp8
 }
+
+; This test ensures that a load, which would have been widened otherwise is
+; instead scalarized if Cost-Model so decided as part of its
+; sink-scalar-operands optimization for predicated instructions.
+;
+; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.udiv.continue:
+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %i7 = mul i64 %i, 777
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp4 = udiv i32 %tmp2, %x
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}