Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -568,6 +568,10 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// Return true if the target will expand (scalarize) the vector Opcode, so
+  /// that it is preferable to also scalarize a memory access connected to it.
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.
@@ -1078,6 +1082,8 @@
   getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool
+  preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
@@ -1346,6 +1352,11 @@
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode,
+                              Type *VecTy) override {
+    return Impl.preferScalarizedMemFor(Stores, Opcode, VecTy);
+  }
+
   bool supportsEfficientVectorElementLoadStore() override {
     return Impl.supportsEfficientVectorElementLoadStore();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -303,6 +303,10 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) {
+    return false;
+  }
+
   bool supportsEfficientVectorElementLoadStore() { return false; }
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -251,6 +251,11 @@
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+bool TargetTransformInfo::
+preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) const {
+  return TTIImpl->preferScalarizedMemFor(Stores, Opcode, VecTy);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -70,6 +70,7 @@
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -347,6 +347,27 @@
   return ((WideBits % 128U) ?
           ((WideBits / 128U) + 1) : (WideBits / 128U));
 }
 
+bool SystemZTTIImpl::
+preferScalarizedMemFor(bool Stores, unsigned Opcode, Type *VecTy) {
+  assert(VecTy->isVectorTy());
+  if (Stores && getScalarSizeInBits(VecTy) != 64)
+    return false;
+  if (!Stores && VecTy->isFPOrFPVectorTy())
+    return false;
+
+  // These opcodes may appear to map to expanded vector DAG nodes here, but
+  // in fact they are not expanded.
+  if ((Opcode == Instruction::Select) || (Opcode == Instruction::SExt) ||
+      (Opcode == Instruction::ZExt) || (Opcode == Instruction::Trunc))
+    return false;
+  const TargetLoweringBase *TLI = getTLI();
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  if (!ISD)
+    return false;
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+  return TLI->isOperationExpand(ISD, LT.second);
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1105,6 +1105,8 @@
     return isScalarWithPredication(I);
   }
 
+  bool isScalarWithoutPredication(Instruction *I, unsigned VF);
+
   /// Returns true if \p I is a memory instruction with consecutive memory
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
@@ -1166,14 +1168,16 @@
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
+  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF,
+                                         bool InPassedVF = true);
 
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
-  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy,
+                              bool InPassedVF);
 
   /// Calculate vectorization cost of memory instruction \p I.
-  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF, bool InPassedVF = true);
 
   /// The cost computation for scalarized memory instruction.
   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
@@ -1263,6 +1267,9 @@
   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                               unsigned VF);
 
+  int computeInstDiscount(Instruction *Inst, ScalarCostsTy &ScalarCosts,
+                          unsigned VF);
+
   /// Collect the instructions that are uniform after vectorization. An
   /// instruction is uniform if we represent it with a single scalar value in
   /// the vectorized loop corresponding to each vector iteration. Examples of
@@ -4378,6 +4385,21 @@
   return false;
 }
 
+bool LoopVectorizationCostModel::isScalarWithoutPredication(Instruction *I,
+                                                            unsigned VF) {
+  if (isa<LoadInst>(I) || isa<StoreInst>(I))
+    return false;
+
+  // A TTI query could be used to ask the target whether the instruction will
+  // be expanded during instruction selection, but that should be equivalent
+  // to finding a vector cost greater than the sum of the scalar costs.
+  assert(VF > 1 && "Should only be called with VF > 1.");
+  unsigned ScalarCost =
+      getInstructionCost(I, 1, false/*InPassedVF*/).first * VF;
+  unsigned VectorCost = getInstructionCost(I, VF).first;
+  return (ScalarCost < VectorCost);
+}
+
 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
@@ -5253,6 +5275,21 @@
         PredicatedBBsAfterVectorization.insert(BB);
       }
   }
+
+  // Find sequences of non-predicated instructions that have a lower cost if
+  // scalarized. These are typically instructions for which the target has no
+  // native vector support.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (blockNeedsPredication(BB))
+      continue;
+    for (Instruction &I : *BB)
+      if (!isScalarAfterVectorization(&I, VF) && !ScalarCostsVF.count(&I) &&
+          isScalarWithoutPredication(&I, VF)) {
+        ScalarCostsTy ScalarCosts;
+        if (computeInstDiscount(&I, ScalarCosts, VF) >= 0)
+          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+      }
+  }
 }
 
 int LoopVectorizationCostModel::computePredInstDiscount(
@@ -5370,6 +5407,71 @@
   return Discount;
 }
 
+int LoopVectorizationCostModel::computeInstDiscount(
+    Instruction *Inst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+    unsigned VF) {
+
+  // Initialize the discount to zero, meaning that the scalar version and the
+  // vector version cost the same.
+  int Discount = 0;
+
+  auto needsExtract = [&](Instruction *I) -> bool {
+    if (isa<LoadInst>(I))
+      return getWideningDecision(I, VF) != CM_Scalarize;
+    return (TheLoop->contains(I) && !isScalarAfterVectorization(I, VF) &&
+            !ScalarCosts.count(I));
+  };
+
+  // Holds instructions to analyze. Those instructions are the ones that
+  // would be scalarized if we find that the scalar version costs less.
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(Inst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    unsigned VectorCost = getInstructionCost(I, VF).first;
+    unsigned ScalarCost =
+        VF * getInstructionCost(I, 1, false/*InPassedVF*/).first;
+
+    // Operands: add an extract cost for each operand that remains a vector.
+    for (Use &U : I->operands())
+      if (auto *J = dyn_cast<Instruction>(U.get())) {
+        assert(VectorType::isValidElementType(J->getType()) &&
+               "Instruction has non-scalar type");
+        if (needsExtract(J))
+          ScalarCost += TTI.getScalarizationOverhead(
+              ToVectorTy(J->getType(), VF), false, true /*Extract*/);
+      }
+
+    // Users: check whether the result is needed as a vector by any user.
+    SmallVector<Instruction *, 8> TmpWorklist;
+    bool VectorUse = false;
+    for (User *U : I->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (isScalarWithoutPredication(UI, VF))
+        TmpWorklist.push_back(UI);
+      else if (isa<StoreInst>(UI) &&
+               getWideningDecision(UI, VF) == CM_Scalarize)
+        continue;
+      else {
+        VectorUse = true;
+        break;
+      }
+    }
+    if (!VectorUse)
+      Worklist.append(TmpWorklist.begin(), TmpWorklist.end());
+    else
+      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+                                                 true /*Insert*/, false);
+
+    // Compute the discount. A non-negative discount means the vector version
+    // of the instruction costs more, and scalarizing would be beneficial.
+    Discount += VectorCost - ScalarCost;
+    ScalarCosts[I] = ScalarCost;
+  }
+
+  return Discount;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
@@ -5587,7 +5689,8 @@
 
 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
-                                                              unsigned VF) {
+                                                              unsigned VF,
+                                                              bool InPassedVF) {
   // Calculate scalar cost only. Vectorization cost should be ready at this
   // moment.
   if (VF == 1) {
@@ -5596,13 +5699,15 @@
     unsigned AS = getLoadStoreAddressSpace(I);
 
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+                               (InPassedVF ? I : nullptr));
   }
   return getWideningCost(I, VF);
 }
 
 LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF,
+                                               bool InPassedVF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -5620,7 +5725,7 @@
   }
 
   Type *VectorTy;
-  unsigned C = getInstructionCost(I, VF, VectorTy);
+  unsigned C = getInstructionCost(I, VF, VectorTy, InPassedVF);
 
   bool TypeNotScalarized =
       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
@@ -5659,16 +5764,61 @@
         continue;
       }
 
-      // We assume that widening is the best solution when possible.
-      if (memoryInstructionCanBeWidened(&I, VF)) {
-        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+      // Find cases where scalarization is cheaper than widening plus the
+      // scalarization overhead.
+      const unsigned WidenCost = memoryInstructionCanBeWidened(&I, VF)
+                                     ? getConsecutiveMemOpCost(&I, VF)
+                                     : UINT_MAX;
+      bool LoadToScalarized = isa<LoadInst>(&I);
+      if (LoadToScalarized)
+        for (User *U : I.users())
+          if (!TTI.preferScalarizedMemFor(false/*Stores*/,
+                                          cast<Instruction>(U)->getOpcode(),
+                                          ToVectorTy(I.getType(), VF))) {
+            LoadToScalarized = false;
+            break;
+          }
+      bool StoreOfScalarized = false;
+      if (isa<StoreInst>(&I))
+        if (auto *OpAsIns = dyn_cast<Instruction>(I.getOperand(0)))
+          if (TTI.preferScalarizedMemFor(true/*Stores*/,
+                                         OpAsIns->getOpcode(),
+                                         ToVectorTy(OpAsIns->getType(), VF)))
+            StoreOfScalarized = true;
+      if (LoadToScalarized || StoreOfScalarized) {
+        // Assume that scalarization is best unless the cost functions give a
+        // smaller sum for widening + overhead.
+        unsigned WidenPlusOverheadCost = WidenCost;
+        if (WidenCost < UINT_MAX) {
+          if (LoadToScalarized) {
+            WidenPlusOverheadCost += TTI.getScalarizationOverhead(
+                ToVectorTy(I.getType(), VF), false, true /*Extract*/);
+          } else if (StoreOfScalarized) {
+            WidenPlusOverheadCost += TTI.getScalarizationOverhead(
+                ToVectorTy(I.getOperand(0)->getType(), VF), true /*Insert*/,
+                false);
+          }
+        }
+        unsigned ScalarizationCost =
+            VF * getInstructionCost(&I, 1, false/*InPassedVF*/).first;
+        if (ScalarizationCost < WidenPlusOverheadCost) {
+          // The cost does not include scalarization overhead, since the
+          // user(s)/def will also be scalarized, as computeInstDiscount()
+          // will recognize later.
+          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
+          continue;
+        }
+      }
+
+      if (WidenCost < UINT_MAX) {
+        // Assume now that widening is the best solution when possible.
         int ConsecutiveStride =
-          Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
                "Expected consecutive stride.");
         InstWidening Decision = ConsecutiveStride == 1 ?
                                     CM_Widen : CM_Widen_Reverse;
-        setWideningDecision(&I, VF, Decision, Cost);
+        setWideningDecision(&I, VF, Decision, WidenCost);
         continue;
       }
 
@@ -5781,7 +5931,8 @@
 
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF,
-                                                        Type *&VectorTy) {
+                                                        Type *&VectorTy,
+                                                        bool InPassedVF) {
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -5915,7 +6066,8 @@
     if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);
 
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+                                  (InPassedVF ? I : nullptr));
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
@@ -5924,7 +6076,8 @@
     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
     VectorTy = ToVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
+                                  (InPassedVF ? I : nullptr));
   }
   case Instruction::Store:
   case Instruction::Load: {
@@ -5937,7 +6090,7 @@
       Width = 1;
     }
     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
-    return getMemoryInstructionCost(I, VF);
+    return getMemoryInstructionCost(I, VF, InPassedVF);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -5983,7 +6136,8 @@
     }
 
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
+                                    (InPassedVF ? I : nullptr));
  }
  case Instruction::Call: {
    bool NeedToScalarize;
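
Illustrative note, not part of the patch: the sketch below shows how another
target could opt in to the new TTI hook, mirroring the SystemZ override above.
The class name HypoTTIImpl and the restriction to integer division are
assumptions made only for this example; by default the hook stays disabled for
every other target.

// Hypothetical target override (illustration only). Prefer scalarized
// loads/stores that are connected to a vector operation the legalizer
// would expand anyway.
bool HypoTTIImpl::preferScalarizedMemFor(bool Stores, unsigned Opcode,
                                         Type *VecTy) {
  assert(VecTy->isVectorTy() && "Expected a vector type");
  (void)Stores; // This sketch does not distinguish loads from stores.
  // Assumption for the example: only consider integer division, which this
  // hypothetical target has no native vector instructions for.
  if (Opcode != Instruction::UDiv && Opcode != Instruction::SDiv)
    return false;
  // Defer to the legalizer: if the operation on the legalized vector type
  // would be expanded (scalarized), prefer a scalarized memory access too.
  const TargetLoweringBase *TLI = getTLI();
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (!ISD)
    return false;
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
  return TLI->isOperationExpand(ISD, LT.second);
}

With such an override in place, setCostBasedWideningDecision() above compares
the fully scalarized memory cost against widening plus the insert/extract
overhead before committing to CM_Scalarize.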