Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -568,6 +568,9 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// Return true if the target will expand (scalarize) this vector instruction.
+  bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) const;
+
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
   /// the scalarization cost of a load/store.
@@ -761,12 +764,16 @@
   /// \p Args is an optional argument which holds the instruction operands
   /// values so the TTI can analyze those values searching for special
   /// cases or optimizations based on those values.
+  /// \p Insert and \p Extracts let the caller indicate that, if the target
+  /// scalarizes this instruction, the result insertion and/or some of the
+  /// operand extractions are not needed and should not be costed.
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
       OperandValueKind Opd2Info = OK_AnyValue,
       OperandValueProperties Opd1PropInfo = OP_None,
       OperandValueProperties Opd2PropInfo = OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) const;
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>()) const;
 
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
@@ -776,9 +783,10 @@
 
   /// \return The expected cost of cast instructions, such as bitcast, trunc,
   /// zext, etc. If there is an existing instruction that holds Opcode, it
-  /// may be passed in the 'I' parameter.
+  /// may be passed in the 'I' parameter. For Insert/Extract, see comment above.
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr) const;
+                       const Instruction *I = nullptr, bool Insert = true,
+                       bool Extract = true) const;
 
   /// \return The expected cost of a sign- or zero-extended vector extract. Use
   /// -1 to indicate that there is no information about the index value.
@@ -1073,6 +1081,7 @@
   getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
@@ -1115,11 +1124,13 @@
                          OperandValueKind Opd2Info,
                          OperandValueProperties Opd1PropInfo,
                          OperandValueProperties Opd2PropInfo,
-                         ArrayRef<const Value *> Args) = 0;
+                         ArrayRef<const Value *> Args,
+                         bool Insert, ArrayRef<bool> Extracts) = 0;
   virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                              Type *SubTp) = 0;
   virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                               const Instruction *I) = 0;
+                               const Instruction *I, bool Insert,
+                               bool Extract) = 0;
   virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                        VectorType *VecTy, unsigned Index) = 0;
   virtual int getCFInstrCost(unsigned Opcode) = 0;
@@ -1340,6 +1351,10 @@
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
 
+  bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) override {
+    return Impl.isVecInstrExpanded(Opcode, VecTy);
+  }
+
   bool supportsEfficientVectorElementLoadStore() override {
     return Impl.supportsEfficientVectorElementLoadStore();
   }
@@ -1440,17 +1455,19 @@
                          OperandValueKind Opd2Info,
                          OperandValueProperties Opd1PropInfo,
                          OperandValueProperties Opd2PropInfo,
-                         ArrayRef<const Value *> Args) override {
+                         ArrayRef<const Value *> Args,
+                         bool Insert, ArrayRef<bool> Extracts) override {
     return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                       Opd1PropInfo, Opd2PropInfo, Args);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   }
   int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                      Type *SubTp) override {
     return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
   }
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I) override {
-    return Impl.getCastInstrCost(Opcode, Dst, Src, I);
+                       const Instruction *I, bool Insert,
+                       bool Extract) override {
+    return Impl.getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
   }
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -302,6 +302,8 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool isVecInstrExpanded(unsigned Opcode, Type *VecTy) { return false; }
+
   bool supportsEfficientVectorElementLoadStore() { return false; }
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
@@ -405,7 +407,8 @@
                                   TTI::OperandValueKind Opd2Info,
                                   TTI::OperandValueProperties Opd1PropInfo,
                                   TTI::OperandValueProperties Opd2PropInfo,
-                                  ArrayRef<const Value *> Args) {
+                                  ArrayRef<const Value *> Args,
+                                  bool Insert, ArrayRef<bool> Extracts) {
     return 1;
   }
 
@@ -415,7 +418,8 @@
   }
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                            const Instruction *I) { return 1; }
+                            const Instruction *I, bool Insert,
+                            bool Extract) { return 1; }
 
   unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                     VectorType *VecTy, unsigned Index) {
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -502,15 +502,24 @@
     return Cost;
   }
 
-  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
+  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args,
+              bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>()) {
     assert(VecTy->isVectorTy());
 
     unsigned Cost = 0;
 
-    Cost += getScalarizationOverhead(VecTy, true, false);
-    if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args,
+    if (Insert)
+      Cost += getScalarizationOverhead(VecTy, true, false);
+    if (!Args.empty()) {
+      SmallVector<const Value *, 4> VecArgs;
+      for (unsigned i = 0; i < Args.size(); i++) {
+        if (Extracts.size() > i && !Extracts[i])
+          continue;
+        VecArgs.push_back(Args[i]);
+      }
+      Cost += getOperandsScalarizationOverhead(VecArgs,
                                                VecTy->getVectorNumElements());
+    }
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
@@ -527,7 +536,8 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>()) {
     // Check if any of the operands are vector operands.
     const TargetLoweringBase *TLI = getTLI();
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -561,7 +571,7 @@
                           ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(Ty, Args) + Num * Cost;
+      return getScalarizationOverhead(Ty, Args, Insert, Extracts) + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -585,7 +595,8 @@
   }
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                            const Instruction *I = nullptr) {
+                            const Instruction *I = nullptr,
+                            bool Insert = true, bool Extract = true) {
     const TargetLoweringBase *TLI = getTLI();
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
     assert(ISD && "Invalid opcode");
@@ -681,7 +692,8 @@
                                          Src->getVectorNumElements() / 2);
         T *TTI = static_cast<T *>(this);
         return TTI->getVectorSplitCost() +
-               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
+               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I,
+                                          Insert, Extract));
       }
 
       // In other cases where the source or destination are illegal, assume
@@ -692,7 +704,7 @@
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+      return getScalarizationOverhead(Dst, Insert, Extract) + Num * Cost;
     }
 
     // We already handled vector-to-vector and scalar-to-scalar conversions.
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -251,6 +251,11 @@
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+bool TargetTransformInfo::
+isVecInstrExpanded(unsigned Opcode, Type *VecTy) const {
+  return TTIImpl->isVecInstrExpanded(Opcode, VecTy);
+}
+
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
@@ -435,9 +440,10 @@
     unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
     OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
     OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) const {
+    ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) const {
   int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                             Opd1PropInfo, Opd2PropInfo, Args);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -450,10 +456,11 @@
 }
 
 int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                 Type *Src, const Instruction *I) const {
+           Type *Src, const Instruction *I, bool Insert, bool Extract) const {
   assert ((I == nullptr || I->getOpcode() == Opcode) &&
           "Opcode should reflect passed instruction.");
-  int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
+  int Cost =
+    TTIImpl->getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -111,7 +111,8 @@
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr);
+                       const Instruction *I = nullptr,
+                       bool Insert = true, bool Extract = true);
 
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index);
@@ -124,7 +125,8 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
 
   int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
 
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -259,7 +259,7 @@
 }
 
 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                     const Instruction *I) {
+                             const Instruction *I, bool Insert, bool Extract) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -287,7 +287,7 @@
   EVT DstTy = TLI->getValueType(DL, Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 
   static const TypeConversionCostTblEntry
   ConversionTbl[] = {
@@ -391,7 +391,7 @@
                                                  SrcTy.getSimpleVT()))
     return Entry->Cost;
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 }
 
 int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
@@ -477,7 +477,8 @@
 int AArch64TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
@@ -496,7 +497,7 @@
   switch (ISD) {
   default:
     return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                                Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   case ISD::SDIV:
     if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
         Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
@@ -543,7 +544,7 @@
     }
 
     Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                          Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
     if (Ty->isVectorTy()) {
       // On AArch64, vector divisions are not supported natively and are
       // expanded into scalar divisions of each pair of elements.
@@ -552,7 +553,8 @@
       Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
                                      Opd2Info, Opd1PropInfo, Opd2PropInfo);
       // TODO: if one of the arguments is scalar, then it's not necessary to
-      // double the cost of handling the vector elements.
+      // double the cost of handling the vector elements. Note: this may be
+      // handled by implementing isVecInstrExpanded().
       Cost += Cost;
     }
     return Cost;
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -165,7 +165,8 @@
     TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
     TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
     TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-    ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+    ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+    bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
 
   unsigned getCFInstrCost(unsigned Opcode);
 
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -335,11 +335,12 @@
 int GCNTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
   if (!OrigTy.isSimple()) {
     return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   }
 
   // Legalize the type.
@@ -439,7 +440,7 @@
   }
 
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+                          Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 }
 
 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -146,7 +146,8 @@
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr);
+                       const Instruction *I = nullptr,
+                       bool Insert = true, bool Extract = true);
 
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
@@ -162,7 +163,8 @@
       TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
 
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -134,7 +134,7 @@
 }
 
 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                 const Instruction *I) {
+                            const Instruction *I, bool Insert, bool Extract) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -157,7 +157,7 @@
   EVT DstTy = TLI->getValueType(DL, Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 
   // Some arithmetic, load and store operations have specific instructions
   // to cast up/down their types automatically at no extra cost.
@@ -324,7 +324,7 @@
       return Entry->Cost;
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 }
 
 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -478,7 +478,7 @@
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
     TTI::OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) {
+    ArrayRef<const Value *> Args, bool Insert, ArrayRef<bool> Extracts) {
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
@@ -530,7 +530,7 @@
       return LT.first * Entry->Cost;
 
   int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                           Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 
   // This is somewhat of a hack. The problem that we are facing is that SROA
   // creates a sequence of shift, and, or instructions to construct values.
Index: lib/Target/Hexagon/HexagonTargetTransformInfo.h
===================================================================
--- lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -131,9 +131,11 @@
             TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
             TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
             TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-            ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+            ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+            bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-            const Instruction *I = nullptr);
+                            const Instruction *I = nullptr,
+                            bool Insert = true, bool Extract = true);
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
   unsigned getCFInstrCost(unsigned Opcode) {
Index: lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
===================================================================
--- lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -226,18 +226,19 @@
 unsigned HexagonTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
       TTI::OperandValueProperties Opd1PropInfo,
-      TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args) {
+      TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value*> Args,
+      bool Insert, ArrayRef<bool> Extracts) {
   if (Ty->isVectorTy()) {
     std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
     if (LT.second.isFloatingPoint())
       return LT.first + FloatFactor * getTypeNumElements(Ty);
   }
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                       Opd1PropInfo, Opd2PropInfo, Args);
+                          Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 }
 
 unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
-      Type *SrcTy, const Instruction *I) {
+      Type *SrcTy, const Instruction *I, bool Insert, bool Extract) {
   if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
     unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
     unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
Index: lib/Target/Lanai/LanaiTargetTransformInfo.h
===================================================================
--- lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -82,13 +82,14 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>()) {
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
 
     switch (ISD) {
     default:
       return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                           Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
     case ISD::MUL:
     case ISD::SDIV:
     case ISD::UDIV:
Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -92,7 +92,8 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -115,7 +115,8 @@
 int NVPTXTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
@@ -124,7 +125,7 @@
   switch (ISD) {
   default:
     return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+                         Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -137,7 +138,7 @@
       return 2 * LT.first;
     // Delegate other cases to the basic TTI.
     return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+                          Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
   }
 }
 
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -77,10 +77,11 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr);
+       const Instruction *I = nullptr, bool Insert = true, bool Extract = true);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -327,12 +327,13 @@
 int PPCTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 }
 
 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -349,10 +350,10 @@
 }
 
 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                 const Instruction *I) {
+                        const Instruction *I, bool Insert, bool Extract) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 }
 
 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -70,18 +70,21 @@
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool isVecInstrExpanded(unsigned Opcode, Type *VecTy);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr);
+                       const Instruction *I = nullptr,
+                       bool Insert = true, bool Extract = true);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -347,12 +347,28 @@
   return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
 }
 
+/// Return true if the ISD operation corresponding to \p Opcode is marked as
+/// Expand for the type \p VecTy legalizes to. Unmapped opcodes give false.
+bool SystemZTTIImpl::isVecInstrExpanded(unsigned Opcode, Type *VecTy) {
+  assert(VecTy->isVectorTy() || VecTy->isVoidTy());
+  // These opcodes look like expanded vector DAG nodes, but in fact are not.
+  if (Opcode == Instruction::Select || Opcode == Instruction::SExt ||
+      Opcode == Instruction::ZExt || Opcode == Instruction::Trunc)
+    return false;
+  const TargetLoweringBase *Lowering = getTLI();
+  int ISDOpc = Lowering->InstructionOpcodeToISD(Opcode);
+  return ISDOpc &&
+         Lowering->isOperationExpand(
+             ISDOpc, Lowering->getTypeLegalizationCost(DL, VecTy).second);
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
     TTI::OperandValueProperties Opd1PropInfo,
     TTI::OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) {
+    ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
 
   // TODO: return a good value for BB-VECTORIZER that includes the
   // immediate loads, which we do not want to count for the loop
@@ -408,7 +424,8 @@
     if (DivRemConstPow2)
       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
     if (DivRemConst)
-      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+      return VF * DivMulSeqCost +
+        getScalarizationOverhead(Ty, Args, Insert, Extracts);
     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
       // Temporary hack: disable high vectorization factors with integer
       // division/remainder, which will get scalarized and handled with
@@ -431,7 +448,8 @@
         // inserting and extracting the values.
         unsigned ScalarCost =
             getArithmeticInstrCost(Opcode, Ty->getScalarType());
-        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+        unsigned Cost = (VF * ScalarCost) +
+            getScalarizationOverhead(Ty, Args, Insert, Extracts);
         // FIXME: VF 2 for these FP operations are currently just as
         // expensive as for VF 4.
         if (VF == 2)
@@ -448,7 +466,8 @@
 
     // There is no native support for FRem.
     if (Opcode == Instruction::FRem) {
-      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+      unsigned Cost = (VF * LIBCALL_COST) +
+        getScalarizationOverhead(Ty, Args, Insert, Extracts);
       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
       if (VF == 2 && ScalarBits == 32)
         Cost *= 2;
@@ -494,7 +513,7 @@
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
-                                       Opd1PropInfo, Opd2PropInfo, Args);
+                           Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 }
 
 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -636,7 +655,7 @@
 }
 
 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                     const Instruction *I) {
+                             const Instruction *I, bool Insert, bool Extract) {
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
   unsigned SrcScalarBits = Src->getScalarSizeInBits();
 
@@ -696,16 +715,15 @@
       unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
                                              Src->getScalarType());
       unsigned TotCost = VF * ScalarCost;
-      bool NeedsInserts = true, NeedsExtracts = true;
       // FP128 registers do not get inserted or extracted.
       if (DstScalarBits == 128 &&
           (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
-        NeedsInserts = false;
+        Insert = false;
       if (SrcScalarBits == 128 &&
           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
-        NeedsExtracts = false;
+        Extract = false;
 
-      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+      TotCost += getScalarizationOverhead(Dst, Insert, Extract);
 
       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -716,7 +734,8 @@
 
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
-        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+        return VF /*ldxbr/lexbr*/ +
+          getScalarizationOverhead(Dst, Insert, false);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -729,7 +748,7 @@
         return VF * 2;
       }
       // -> fp128.  VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(Src, false, true);
+      return VF + getScalarizationOverhead(Src, false, Extract);
     }
   }
   else { // Scalar
@@ -758,7 +777,7 @@
     }
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 }
 
 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
Index: lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
===================================================================
--- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -62,7 +62,8 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
   /// @}
Index: lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
===================================================================
--- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -46,7 +46,8 @@
 unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
 
   unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
       Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -68,10 +68,11 @@
       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
-      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+      bool Insert = true, ArrayRef<bool> Extracts = ArrayRef<bool>());
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                       const Instruction *I = nullptr);
+       const Instruction *I = nullptr, bool Insert = true, bool Extract = true);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -174,7 +174,8 @@
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
     TTI::OperandValueProperties Opd1PropInfo,
     TTI::OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) {
+    ArrayRef<const Value *> Args,
+    bool Insert, ArrayRef<bool> Extracts) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
@@ -853,7 +854,8 @@
   }
 
   // Fallback to the default implementation.
-  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                         Opd1PropInfo, Opd2PropInfo, Args, Insert, Extracts);
 }
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -1194,7 +1196,7 @@
 }
 
 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                 const Instruction *I) {
+                          const Instruction *I, bool Insert, bool Extract) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -1566,7 +1568,7 @@
       return Entry->Cost;
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I, Insert, Extract);
 }
 
 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1057,6 +1057,7 @@
     setCostBasedWideningDecision(VF);
     collectLoopUniforms(VF);
     collectLoopScalars(VF);
+    collectTargetScalarized(VF);
   }
 
   /// Returns true if the target machine supports masked store operation
@@ -1246,6 +1247,8 @@
   /// The data is collected per VF.
   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
 
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> TargetScalarized;
+
   /// Holds the instructions (address computations) that are forced to be
   /// scalarized.
   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
@@ -1276,6 +1279,11 @@
   /// iteration of the original scalar loop.
   void collectLoopScalars(unsigned VF);
 
+  void collectTargetScalarized(unsigned VF);
+
+  bool isTargetScalarizedIns(const Value *V, unsigned VF);
+  bool hasOnlyTargetScalarizedUses(const Instruction *I, unsigned VF);
+
   /// Keeps cost model vectorization decision and cost for instructions.
   /// Right now it is used for memory instructions only.
   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
@@ -4296,6 +4304,44 @@
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
+/// Record, for \p VF, every loop instruction that TTI reports as expanded
+/// when widened to \p VF elements. Stores are queried with their value type.
+void LoopVectorizationCostModel::collectTargetScalarized(unsigned VF) {
+  assert(VF >= 2 && !TargetScalarized.count(VF) &&
+         "This function should not be visited twice for the same VF");
+  for (BasicBlock *Block : TheLoop->blocks())
+    for (Instruction &Inst : *Block) {
+      auto *Store = dyn_cast<StoreInst>(&Inst);
+      Type *ElemTy =
+          Store ? Store->getValueOperand()->getType() : Inst.getType();
+      if (TTI.isVecInstrExpanded(Inst.getOpcode(), ToVectorTy(ElemTy, VF)))
+        TargetScalarized[VF].insert(&Inst);
+    }
+}
+
+/// Return true if \p V is an instruction that is either defined outside the
+/// loop or recorded in the target-scalarized set for \p VF. Values that are
+/// not instructions give false.
+bool LoopVectorizationCostModel::isTargetScalarizedIns(const Value *V,
+                                                       unsigned VF) {
+  const auto *Inst = dyn_cast<Instruction>(V);
+  if (!Inst)
+    return false;
+  // Defined outside the loop: assume extraction is done in preheader.
+  return !TheLoop->contains(Inst) || TargetScalarized[VF].count(Inst) != 0;
+}
+
+/// Return true if all users of \p I inside the loop are target-scalarized.
+bool LoopVectorizationCostModel::hasOnlyTargetScalarizedUses(
+    const Instruction *I, unsigned VF) {
+  for (const User *Usr : I->users()) {
+    const auto *UserI = cast<Instruction>(Usr);
+    if (!TargetScalarized[VF].count(UserI) && TheLoop->contains(UserI))
+      return false;
+  }
+  return true;
+}
+
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
   if (!blockNeedsPredication(I->getParent()))
     return false;
@@ -5815,12 +5861,18 @@
         TTI.getOperandInfo(Op2, Op2VP);
     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
       Op2VK = TargetTransformInfo::OK_UniformValue;
-
     SmallVector<const Value *, 4> Operands(I->operand_values());
+    bool Insert = true;
+    SmallVector<bool, 4> Extracts(Operands.size(), true);
+    if (VF > 1 && isTargetScalarizedIns(I, VF)) {
+      Insert = !hasOnlyTargetScalarizedUses(I, VF);
+      for (unsigned i = 0; i < Operands.size(); i++)
+        Extracts[i] = !isTargetScalarizedIns(Operands[i], VF);
+    }
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
     return N * TTI.getArithmeticInstrCost(
-                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
-                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
+             I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, Op2VK,
+             TargetTransformInfo::OP_None, Op2VP, Operands, Insert, Extracts);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -5897,8 +5949,15 @@
       }
     }
 
+    bool Insert = true;
+    bool Extract = true;
+    if (VF > 1 && isTargetScalarizedIns(I, VF)) {
+      Insert = !hasOnlyTargetScalarizedUses(I, VF);
+      Extract = !isTargetScalarizedIns(I->getOperand(0), VF);
+    }
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I,
+                                    Insert, Extract);
   }
   case Instruction::Call: {
     bool NeedToScalarize;