Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -575,9 +575,11 @@ /// Phi, Ret, Br. int getCFInstrCost(unsigned Opcode) const; - /// \returns The expected cost of compare and select instructions. + /// \returns The expected cost of compare and select instructions. If there + /// is an existing instruction that holds Opcode, it may be passed in the + /// 'I' parameter. int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy = nullptr) const; + Type *CondTy = nullptr, const Instruction *I = nullptr) const; /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. @@ -809,7 +811,7 @@ VectorType *VecTy, unsigned Index) = 0; virtual int getCFInstrCost(unsigned Opcode) = 0; virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) = 0; + Type *CondTy, const Instruction *I) = 0; virtual int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) = 0; virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, @@ -1055,8 +1057,9 @@ int getCFInstrCost(unsigned Opcode) override { return Impl.getCFInstrCost(Opcode); } - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) override { - return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) override { + return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override { return Impl.getVectorInstrCost(Opcode, Val, Index); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -336,7 +336,8 @@ unsigned getCFInstrCost(unsigned Opcode) { return 1; } - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { return 1; } Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -319,6 +319,23 @@ return Cost; } + unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args) { + assert (VecTy->isVectorTy()); + + unsigned Cost = 0; + + Cost += getScalarizationOverhead(VecTy, true, false); + if (!Args.empty()) + Cost += getOperandsScalarizationOverhead(Args, + VecTy->getVectorNumElements()); + else + // When no information on arguments is provided, we add the cost + // associated with one argument as a heuristic. + Cost += getScalarizationOverhead(VecTy, false, true); + + return Cost; + } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } unsigned getArithmeticInstrCost( @@ -361,15 +378,7 @@ ->getArithmeticInstrCost(Opcode, Ty->getScalarType()); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - unsigned TotCost = getScalarizationOverhead(Ty, true, false) + Num * Cost; - if (!Args.empty()) - TotCost += getOperandsScalarizationOverhead(Args, Num); - else - // When no information on arguments is provided, we add the cost - // associated with one argument as a heuristic. 
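Note on the extended hook: any cost client can now hand the IR instruction through the new optional parameter. A minimal sketch of such a caller, assuming the usual TargetTransformInfo and IR headers (the helper name and the use of makeCmpResultType are ours, not part of the patch):

  // Query the compare cost with the originating instruction attached, so a
  // target implementation can inspect its predicate and users.
  static int getCompareCost(const TargetTransformInfo &TTI, const CmpInst *Cmp) {
    Type *ValTy = Cmp->getOperand(0)->getType();
    Type *CondTy = CmpInst::makeCmpResultType(ValTy); // i1, or a vector of i1
    return TTI.getCmpSelInstrCost(Cmp->getOpcode(), ValTy, CondTy, Cmp);
  }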
- TotCost += getScalarizationOverhead(Ty, false, true); - - return TotCost; + return getScalarizationOverhead(Ty, Args) + Num * Cost; } // We don't know anything about this scalar instruction. @@ -512,7 +521,8 @@ return 0; } - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -540,7 +550,7 @@ if (CondTy) CondTy = CondTy->getScalarType(); unsigned Cost = static_cast(this)->getCmpSelInstrCost( - Opcode, ValTy->getScalarType(), CondTy); + Opcode, ValTy->getScalarType(), CondTy, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -447,12 +447,12 @@ case Instruction::Select: { const SelectInst *SI = cast(I); Type *CondTy = SI->getCondition()->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy); + return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I); } case Instruction::Store: { const StoreInst *SI = cast(I); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -329,8 +329,8 @@ } int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); + Type *CondTy, const Instruction *I) const { + int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -103,7 +103,8 @@ int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -436,7 +436,7 @@ } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { + Type *CondTy, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -463,7 +463,7 @@ return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, Index: lib/Target/ARM/ARMTargetTransformInfo.h 
=================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -96,7 +96,8 @@ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -310,7 +310,8 @@ return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a a vector select gets lowered to vbsl. @@ -335,7 +336,7 @@ return LT.first; } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -75,7 +75,8 @@ ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -308,8 +308,9 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { Index: lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- lib/Target/SystemZ/SystemZISelLowering.cpp +++ lib/Target/SystemZ/SystemZISelLowering.cpp @@ -347,9 +347,13 @@ // There should be no need to check for float types other than v2f64 // since <2 x f32> isn't a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); } // Handle floating-point types. 
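The new v2f64 entries presumably make these conversions report Legal when the query is keyed on the FP side of the operation rather than on v2i64; a hypothetical check of that shape, for illustration only (the function name is ours):

  // With only the v2i64 entries registered, a legality query keyed on the
  // operand type would fail even though the instruction exists.
  static bool fpToSIntIsLegal(const TargetLoweringBase &TLI) {
    return TLI.isOperationLegal(ISD::FP_TO_SINT, MVT::v2i64) &&
           TLI.isOperationLegal(ISD::FP_TO_SINT, MVT::v2f64);
  }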
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -27,6 +27,8 @@ const SystemZSubtarget *getST() const { return ST; } const SystemZTargetLowering *getTLI() const { return TLI; } + unsigned const LIBCALL_COST = 30; + public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -53,6 +55,18 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool isFPVectorizationPotentiallyUnsafe() { return false; } + + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef()); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -259,11 +259,17 @@ } } if (isa(&I)) { - NumStores++; Type *MemAccessTy = I.getOperand(0)->getType(); - if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) && + if ((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) && (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128)) - NumStores++; // 128 bit fp/int stores get split. + NumStores += 2; // 128 bit fp/int stores get split. + else if (MemAccessTy->isVectorTy()) { + unsigned NumExpandedStores = + std::max(1U, MemAccessTy->getPrimitiveSizeInBits() / 128); + NumStores += NumExpandedStores; + } + else + NumStores++; } } @@ -313,3 +319,407 @@ return 0; } +int SystemZTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { + + // TODO: return a good value for BB-VECTORIZER that includes the + // immediate loads, which we do not want to count for the loop + // vectorizer, since they are hopefully hoisted out of the loop. This + // would require a new parameter 'InLoop', but not sure if constant + // args are common enough to motivate this. + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + + if (Ty->isVectorTy()) { + unsigned VF = Ty->getVectorNumElements(); + assert (VF <= 16 && "VF greater than 16?"); + assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type."); + unsigned NumVectors = std::max(1U, Ty->getPrimitiveSizeInBits() / 128); + + // These vector operations are custom handled, but are still supported + // with one instruction per vector, regardless of element size. + if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || + Opcode == Instruction::AShr || Opcode == Instruction::Or) { + return NumVectors; + } + + // These FP operations are supported with a single vector instruction for + // double (base implementation assumes float generally costs 2). For + // FP128, the scalar cost is 1, and there is no overhead since the values + // are already in scalar registers. 
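Many of the SystemZ costs that follow are expressed in 128-bit vector registers. A small sketch of the recurring std::max(1U, Bits / 128) pattern with worked values (the helper name is ours):

  // Number of 128-bit vector registers needed for a vector type, as used
  // for the split-store counting above and the NumVectors costs below.
  static unsigned numVectorRegs(Type *VecTy) {
    assert(VecTy->isVectorTy() && "expected a vector type");
    return std::max(1U, VecTy->getPrimitiveSizeInBits() / 128U);
  }
  // e.g. <2 x i64> and <4 x float> -> 1, <4 x i64> -> 2, <16 x i32> -> 4.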
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { + switch (ScalarBits) { + case 32: { + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for these FP operations are currently just as + // expensive as for VF 4. + if (VF == 2) + Cost *= 2; + return Cost; + } + case 64: + case 128: + return NumVectors; + default: + break; + } + } + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) { + unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + // FIXME: VF 2 for float is currently just as expensive as for VF 4. + if (VF == 2 && ScalarBits == 32) + Cost *= 2; + return Cost; + } + } + else { // Scalar: + // These FP operations are supported with a dedicated instruction for + // float, double and fp128 (base implementation assumes float generally + // costs 2). + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FMul || Opcode == Instruction::FDiv) + return 1; + + // There is no native support for FRem. + if (Opcode == Instruction::FRem) + return LIBCALL_COST; + + if (Opcode == Instruction::LShr || Opcode == Instruction::AShr) + return (ScalarBits >= 32 ? 1 : 2 /*ext*/); + + // Or requires one instruction, although it has custom handling for i64. + if (Opcode == Instruction::Or) + return 1; + + // An extra extension for narrow types is needed. + if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem)) + return (ScalarBits < 32 ? 4 /*sext of ops*/ : 2); + + if (Opcode == Instruction::UDiv || Opcode == Instruction::URem) + return (ScalarBits < 32 ? 4 /*zext of both ops*/ : 3); + } + + // Fallback to the default implementation. + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo, Args); +} + +unsigned getLog2Diff(unsigned Bits0, unsigned Bits1) { + if (Bits1 > Bits0) + return (Log2_32(Bits1) - Log2_32(Bits0)); + return (Log2_32(Bits0) - Log2_32(Bits1)); +} + +int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + + unsigned DstScalarBits = Dst->getScalarSizeInBits(); + unsigned SrcScalarBits = Src->getScalarSizeInBits(); + + if (Src->isVectorTy()) { + assert (Dst->isVectorTy()); + unsigned VF = Src->getVectorNumElements(); + assert (VF <= 16 && "VF greater than 16?"); + assert (ST->hasVector() && "getCastInstrCost() called with vector type."); + unsigned SrcVectorBits = Src->getPrimitiveSizeInBits(); + unsigned NumDstVectors = std::max(1U, Dst->getPrimitiveSizeInBits() / 128); + + if (Opcode == Instruction::Trunc) { + if (SrcVectorBits <= 256) + // Up to 2 vector registers can be truncated efficiently with pack or + // permute. The latter requires an immediate mask to be loaded, which + // hopefully gets hoisted to outside the loop. + return 1; + + else if (VF == 8) // Src is <8 x i64> + // Requires several pack / permutes. + return (DstScalarBits == 32 ? 2 : 3); + + else { // 16 vector elements + // Requires multiple pack instructions + if (SrcScalarBits == 32) + return (DstScalarBits == 16 ? 2 : 3); + else // Src is <16 x i64> + return (DstScalarBits == 32 ? 4 : (DstScalarBits == 16 ? 
6 : 7)); + } + + // TODO: return a good value for BB-VECTORIZER that includes the + // immediate loads, which we do not want to count for the loop + // vectorizer, since they are hopefully hoisted out of the loop. This + // would require a new parameter 'InLoop', but not sure if constant + // args are common enough to motivate this. + } + + if (Opcode == Instruction::FPTrunc) { + if (SrcScalarBits == 128) // fp128 -> double/float + return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + else // double -> float + return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); + } + + if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { + if (SrcScalarBits >= 8) { + // ZExt/SExt will be handled with one unpack per doubling of width. + // For types that spans multiple vector registers, some additional + // vector operations are needed. + unsigned NumUnpacks = + getLog2Diff(Src->getScalarType()->getPrimitiveSizeInBits(), + Dst->getScalarType()->getPrimitiveSizeInBits()); + return (NumUnpacks * NumDstVectors) + (NumDstVectors - 1); + } + else if (Src->isIntegerTy(1)) { + // FIXME: i1 isn't optimally treated. + // These values reflect the current handling of i1 for sext/zext. + if (Opcode == Instruction::SExt) { + static const CostTblEntry SextCostTable[] = { + { ISD::SIGN_EXTEND, MVT::v2i8, 3}, + { ISD::SIGN_EXTEND, MVT::v2i16, 3}, + { ISD::SIGN_EXTEND, MVT::v2i32, 3}, + { ISD::SIGN_EXTEND, MVT::v2i64, 2}, + { ISD::SIGN_EXTEND, MVT::v4i8, 3}, + { ISD::SIGN_EXTEND, MVT::v4i16, 3}, + { ISD::SIGN_EXTEND, MVT::v4i32, 2}, + { ISD::SIGN_EXTEND, MVT::v4i64, 6}, + { ISD::SIGN_EXTEND, MVT::v8i8, 3}, + { ISD::SIGN_EXTEND, MVT::v8i16, 2}, + { ISD::SIGN_EXTEND, MVT::v8i32, 6}, + { ISD::SIGN_EXTEND, MVT::v8i64, 13}, + { ISD::SIGN_EXTEND, MVT::v16i8, 2}, + { ISD::SIGN_EXTEND, MVT::v16i16, 6}, + { ISD::SIGN_EXTEND, MVT::v16i32, 12}, + { ISD::SIGN_EXTEND, MVT::v16i64, 23}, + }; + MVT MTy = TLI->getValueType(DL, Dst).getSimpleVT(); + if (const auto *Entry = + CostTableLookup(SextCostTable, ISD::SIGN_EXTEND, MTy)) + return Entry->Cost; + } + else { // ZExt + static const CostTblEntry ZextCostTable[] = { + { ISD::ZERO_EXTEND, MVT::v2i8, 2}, + { ISD::ZERO_EXTEND, MVT::v2i16, 2}, + { ISD::ZERO_EXTEND, MVT::v2i32, 2}, + { ISD::ZERO_EXTEND, MVT::v2i64, 1}, + { ISD::ZERO_EXTEND, MVT::v4i8, 2}, + { ISD::ZERO_EXTEND, MVT::v4i16, 2}, + { ISD::ZERO_EXTEND, MVT::v4i32, 1}, + { ISD::ZERO_EXTEND, MVT::v4i64, 4}, + { ISD::ZERO_EXTEND, MVT::v8i8, 2}, + { ISD::ZERO_EXTEND, MVT::v8i16, 1}, + { ISD::ZERO_EXTEND, MVT::v8i32, 4}, + { ISD::ZERO_EXTEND, MVT::v8i64, 12}, + { ISD::ZERO_EXTEND, MVT::v16i8, 1}, + { ISD::ZERO_EXTEND, MVT::v16i16, 4}, + { ISD::ZERO_EXTEND, MVT::v16i32, 12}, + { ISD::ZERO_EXTEND, MVT::v16i64, 32}, + }; + MVT MTy = TLI->getValueType(DL, Dst).getSimpleVT(); + if (const auto *Entry = + CostTableLookup(ZextCostTable, ISD::ZERO_EXTEND, MTy)) + return Entry->Cost; + } + } + } + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || + Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { + // TODO: Fix base implementation which could simplify things a bit here + // (seems to miss on differentiating on scalar/vector types). + + // Only 64 bit vector conversions are natively supported. + if (SrcScalarBits == 64 && DstScalarBits == 64) + return NumDstVectors; + + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. Base implementation does not + // realize float->int gets scalarized. 
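The unpack-based zext/sext cost above has a simple closed form; a worked sketch (ours), valid for element sizes of 8 bits and up:

  // One unpack per doubling of the element width, replicated for each
  // destination vector register, plus NumDstVectors - 1 extra vector ops
  // when the result spans several registers.
  static unsigned vecExtCost(unsigned SrcBits, unsigned DstBits,
                             unsigned NumDstVectors) {
    unsigned NumUnpacks = Log2_32(DstBits) - Log2_32(SrcBits);
    return NumUnpacks * NumDstVectors + (NumDstVectors - 1);
  }
  // e.g. zext <4 x i16> to <4 x i64>: 2 unpacks * 2 vectors + 1 = 5.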
+ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(), + Src->getScalarType()); + unsigned TotCost = VF * ScalarCost; + bool NeedsInserts = true, NeedsExtracts = true; + // FP128 registers do not get inserted or extracted. + if (DstScalarBits == 128 && + (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)) + NeedsInserts = false; + if (SrcScalarBits == 128 && + (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) + NeedsExtracts = false; + + TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts); + + // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. + if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) + TotCost *= 2; + + return TotCost; + } + + if (Opcode == Instruction::FPExt) { + if (SrcScalarBits == 32 && DstScalarBits == 64) { + // float -> double is very rare and currently unoptimized. Instead of + // using vldeb, which can do two at a time, all conversions are + // scalarized. + return VF * 2; + } + // VF * lxdb/lxeb + extraction of elements. + return VF + getScalarizationOverhead(Src, false, true); + } + } + else { // Scalar + assert (!Dst->isVectorTy()); + + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) + return (SrcScalarBits <= 64 ? + (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/) : LIBCALL_COST); + + if (Opcode == Instruction::SExt && Src->isIntegerTy(1)) + // nilf/risbgn + lcr/lcgr + return 2; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +static Type *ToVectorTy(Type *T, unsigned VF) { + if (!T->isVectorTy() && VF > 1) + return VectorType::get(T, VF); + return T; +} + +int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + + // Hand over to common code if it's a compare for branch. + if (I != nullptr && I->hasOneUse() && + isa(I->use_begin()->getUser())) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr); + + if (ValTy->isVectorTy()) { + unsigned VF = ValTy->getVectorNumElements(); + + // Called with a compare instruction. + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { + Type *SelectedTy = nullptr; + unsigned PredicateExtraCost = 0; + if (I != nullptr) { + assert (isa(I)); + if (I->hasOneUse()) { // FIXME: Need to handle several users? + if (SelectInst *SI = dyn_cast(I->use_begin()->getUser())) + SelectedTy = ToVectorTy(SI->getType(), VF); + } + + // Some predicates cost one or two extra instructions. + switch (dyn_cast(I)->getPredicate()) { + case CmpInst::Predicate::ICMP_NE: + case CmpInst::Predicate::ICMP_UGE: + case CmpInst::Predicate::ICMP_ULE: + case CmpInst::Predicate::ICMP_SGE: + case CmpInst::Predicate::ICMP_SLE: + PredicateExtraCost = 1; + break; + case CmpInst::Predicate::FCMP_ONE: + case CmpInst::Predicate::FCMP_ORD: + case CmpInst::Predicate::FCMP_UEQ: + case CmpInst::Predicate::FCMP_UNO: + PredicateExtraCost = 2; + break; + default: + break; + } + } + + // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of + // floats. FIXME: <2 x float> generates same code as <4 x float>. + unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1); + unsigned NumVecs_cmp = + std::max(1U, ValTy->getPrimitiveSizeInBits() / 128); + unsigned NumVecs_sel = (SelectedTy != nullptr ? + std::max(1U, SelectedTy->getPrimitiveSizeInBits() / 128) : 1); + + // If the vector select is split, one compare will be done for each part. 
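A worked form of the vector compare cost being computed here (sketch, ours); the real code additionally takes the maximum of the compare's and the feeding select's vector counts:

  // Base cost per 128-bit compare plus the extra instructions some
  // predicates need (NE/UGE/ULE/SGE/SLE: 1, ONE/ORD/UEQ/UNO: 2). fcmp on
  // <4 x float> is modelled as 10 (2*vmr[lh]f + 2*vldeb + vfchdb per pair
  // of elements).
  static unsigned vecCmpCost(unsigned NumVecs, bool FloatElts,
                             unsigned PredExtraCost) {
    unsigned PerVector = FloatElts ? 10 : 1;
    return NumVecs * (PerVector + PredExtraCost);
  }
  // e.g. icmp sle <4 x i32> feeding a <4 x i32> select: 1 * (1 + 1) = 2.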
+ unsigned Cost = (std::max(NumVecs_cmp, NumVecs_sel) * + (CmpCostPerVector + PredicateExtraCost)); + + // In case the select gets split, and the compared element type is + // smaller than the selected one,extra instructions are needed to move + // the values into the operands for the compares. + if (SelectedTy != nullptr && NumVecs_sel > 1 && NumVecs_cmp < NumVecs_sel) { + Cost += NumVecs_sel; + if (NumVecs_sel == 4) + Cost += (ValTy->getScalarSizeInBits() < 32 ? 3 : 2); + else if (NumVecs_sel == 8) + Cost += 6; + } + + return Cost; + } + else { // Called with a select instruction. + assert (Opcode == Instruction::Select); + + unsigned NumVecs_sel = + std::max(1U, ValTy->getPrimitiveSizeInBits() / 128); + + // We can figure out the extra cost of packing / unpacking if the + // instruction was passed and the compare instruction is found. + unsigned PackCost = 0; + if (I != nullptr) { + assert (isa(I)); + + Type *ComparedTy = nullptr; + if (CmpInst *CI = dyn_cast(I->getOperand(0))) + ComparedTy = ToVectorTy(CI->getOperand(0)->getType(), VF); + + if (ComparedTy != nullptr) { + unsigned NumVecs_cmp = + std::max(1U, ComparedTy->getPrimitiveSizeInBits() / 128); + unsigned SelScalarBits = + ValTy->getScalarType()->getPrimitiveSizeInBits(); + unsigned CmpScalarBits = + ComparedTy->getScalarType()->getPrimitiveSizeInBits(); + unsigned Log2Diff = getLog2Diff(SelScalarBits, CmpScalarBits); + unsigned PacksPerVector = Log2Diff; + if (Log2Diff > 1 && NumVecs_sel <= 2 && NumVecs_cmp <= 2 && + CmpScalarBits > SelScalarBits) + PacksPerVector = 1; // permute used instead. + + // More work is done with very different element types and high + // vectorization factors. + if (Log2Diff > 1 && NumVecs_cmp > 2) + PacksPerVector += ((Log2Diff - 1) * (NumVecs_cmp / 4)); + + PackCost = PacksPerVector * NumVecs_sel; + + if (CmpScalarBits == 64 && SelScalarBits == 16 && VF == 16) + PackCost -= 2; // Minor adjustment + } + } + + return NumVecs_sel /*vsel*/ + PackCost; + } + } + else { // Scalar + switch (Opcode) { + case Instruction::ICmp: { + unsigned Cost = 1; + if (ValTy->getScalarSizeInBits() <= 16) + Cost += 2; // extend both operands + return Cost; + } + case Instruction::Select: + if (ValTy->isFloatingPointTy()) + return 4; // No load on condition for FP, so this costs a conditional jump. + return 1; // Load On Condition. 
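The scalar compare and select cases above compress to the following; a minimal sketch (ours) of the same costs, with scalar FCmp left to the base implementation as in the patch:

  // Narrow integer compares pay for two operand extensions; FP selects
  // have no load-on-condition form and are costed as a branch sequence.
  static unsigned scalarCmpSelCost(unsigned Opcode, Type *ValTy) {
    if (Opcode == Instruction::ICmp)
      return ValTy->getScalarSizeInBits() <= 16 ? 3 : 1; // cmp + 2 extends
    assert(Opcode == Instruction::Select && "FCmp handled by BaseT");
    return ValTy->isFloatingPointTy() ? 4 : 1; // FP select vs. LOC
  }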
+ } + } + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr); +} Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -62,7 +62,8 @@ ArrayRef Args = ArrayRef()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1300,7 +1300,8 @@ return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1366,7 +1367,7 @@ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1351,90 +1351,135 @@ if (RHSShuffle && RHSOp0Width == LHSWidth) { newRHS = RHSOp0; } + // case 4 - if (LHSOp0 == RHSOp0) { + bool EqOp0s = (LHSOp0 == RHSOp0); + if (!EqOp0s && LHSOp0 != nullptr && RHSOp0 != nullptr) { + if (Instruction* LHSOp0Inst = dyn_cast(LHSOp0)) { + if (Instruction* RHSOp0Inst = dyn_cast(RHSOp0)) { + if (LHSOp0Inst->isIdenticalTo(RHSOp0Inst) && + !LHSOp0Inst->mayHaveSideEffects() && !LHSOp0Inst->mayReadFromMemory() && + !RHSOp0Inst->mayHaveSideEffects() && !RHSOp0Inst->mayReadFromMemory()) { + EqOp0s = true; + } + } + } + } + if (EqOp0s) { newLHS = LHSOp0; newRHS = nullptr; } - if (newLHS == LHS && newRHS == RHS) - return MadeChange ? &SVI : nullptr; - - SmallVector LHSMask; - SmallVector RHSMask; - if (newLHS != LHS) - LHSMask = LHSShuffle->getShuffleMask(); - if (RHSShuffle && newRHS != RHS) - RHSMask = RHSShuffle->getShuffleMask(); - - unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; SmallVector newMask; - bool isSplat = true; - int SplatElt = -1; - // Create a new mask for the new ShuffleVectorInst so that the new - // ShuffleVectorInst is equivalent to the original one. - for (unsigned i = 0; i < VWidth; ++i) { - int eltMask; - if (Mask[i] < 0) { - // This element is an undef value. - eltMask = -1; - } else if (Mask[i] < (int)LHSWidth) { - // This element is from left hand side vector operand. - // - // If LHS is going to be replaced (case 1, 2, or 4), calculate the - // new mask value for the element. - if (newLHS != LHS) { - eltMask = LHSMask[Mask[i]]; - // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. 
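The new 'case 4' test in InstCombineVectorOps above treats two distinct but identical, side-effect-free instructions as one shuffle source; a reduced sketch of that predicate (ours, with the duplicated checks on both instructions folded into one):

  static bool sameShuffleSource(Value *A, Value *B) {
    if (A == B)
      return true;
    auto *IA = dyn_cast_or_null<Instruction>(A);
    auto *IB = dyn_cast_or_null<Instruction>(B);
    return IA && IB && IA->isIdenticalTo(IB) &&
           !IA->mayHaveSideEffects() && !IA->mayReadFromMemory();
  }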
- if (eltMask >= (int)LHSOp0Width && isa(LHSOp1)) + + if (newLHS == LHS && newRHS == RHS) { + if (LHSShuffle != nullptr && RHSShuffle != nullptr) { + SmallVector LHSMask; + SmallVector RHSMask; + LHSMask = LHSShuffle->getShuffleMask(); + RHSMask = RHSShuffle->getShuffleMask(); + unsigned LHSShuffle_Width = cast(LHSShuffle->getOperand(0)->getType())->getNumElements(); + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + + if (Mask[i] < 0) { + // This element is an undef value. eltMask = -1; - } else - eltMask = Mask[i]; - } else { - // This element is from right hand side vector operand - // - // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. (case 1) - if (isa(RHS)) + } else if (Mask[i] < (int)LHSWidth) { + // This element is from left hand side vector operand. + // + eltMask = LHSMask[Mask[i]]; + } else { + // This element is from right hand side vector operand + // + eltMask = RHSMask[Mask[i] - LHSWidth] + LHSShuffle_Width; + } + + newMask.push_back(eltMask); + } + newLHS = LHSShuffle->getOperand(0); + newRHS = RHSShuffle->getOperand(0); + } + else { + return MadeChange ? &SVI : nullptr; + } + } + else { + SmallVector LHSMask; + SmallVector RHSMask; + if (newLHS != LHS) + LHSMask = LHSShuffle->getShuffleMask(); + if (RHSShuffle && newRHS != RHS) + RHSMask = RHSShuffle->getShuffleMask(); + + unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; + bool isSplat = true; + int SplatElt = -1; + // Create a new mask for the new ShuffleVectorInst so that the new + // ShuffleVectorInst is equivalent to the original one. + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + if (Mask[i] < 0) { + // This element is an undef value. eltMask = -1; - // If RHS is going to be replaced (case 3 or 4), calculate the - // new mask value for the element. - else if (newRHS != RHS) { - eltMask = RHSMask[Mask[i]-LHSWidth]; + } else if (Mask[i] < (int)LHSWidth) { + // This element is from left hand side vector operand. + // + // If LHS is going to be replaced (case 1, 2, or 4), calculate the + // new mask value for the element. + if (newLHS != LHS) { + eltMask = LHSMask[Mask[i]]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)LHSOp0Width && isa(LHSOp1)) + eltMask = -1; + } else + eltMask = Mask[i]; + } else { + // This element is from right hand side vector operand + // // If the value selected is an undef value, explicitly specify it - // with a -1 mask value. - if (eltMask >= (int)RHSOp0Width) { - assert(isa(RHSShuffle->getOperand(1)) - && "should have been check above"); + // with a -1 mask value. (case 1) + if (isa(RHS)) eltMask = -1; - } - } else - eltMask = Mask[i]-LHSWidth; - - // If LHS's width is changed, shift the mask value accordingly. - // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any - // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. - // If newRHS == newLHS, we want to remap any references from newRHS to - // newLHS so that we can properly identify splats that may occur due to - // obfuscation across the two vectors. - if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) - eltMask += newLHSWidth; - } + // If RHS is going to be replaced (case 3 or 4), calculate the + // new mask value for the element. + else if (newRHS != RHS) { + eltMask = RHSMask[Mask[i]-LHSWidth]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. 
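When both inputs are shuffles and neither operand is being replaced, the new path composes the two masks straight through to the shuffles' first operands; the per-element mapping is essentially this (sketch, ours):

  // Element i of the outer mask either stays undef, maps through the LHS
  // shuffle's mask, or maps through the RHS shuffle's mask shifted past
  // the width of the LHS shuffle's source.
  static int composeMaskElt(int OuterElt, ArrayRef<int> LHSMask,
                            ArrayRef<int> RHSMask, unsigned LHSWidth,
                            unsigned LHSSrcWidth) {
    if (OuterElt < 0)
      return -1;
    if (OuterElt < (int)LHSWidth)
      return LHSMask[OuterElt];
    return RHSMask[OuterElt - (int)LHSWidth] + (int)LHSSrcWidth;
  }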
+ if (eltMask >= (int)RHSOp0Width) { + assert(isa(RHSShuffle->getOperand(1)) + && "should have been check above"); + eltMask = -1; + } + } else + eltMask = Mask[i]-LHSWidth; + + // If LHS's width is changed, shift the mask value accordingly. + // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) + eltMask += newLHSWidth; + } - // Check if this could still be a splat. - if (eltMask >= 0) { - if (SplatElt >= 0 && SplatElt != eltMask) - isSplat = false; - SplatElt = eltMask; - } + // Check if this could still be a splat. + if (eltMask >= 0) { + if (SplatElt >= 0 && SplatElt != eltMask) + isSplat = false; + SplatElt = eltMask; + } - newMask.push_back(eltMask); + newMask.push_back(eltMask); + } } // If the result mask is equal to one of the original shuffle masks, // or is a splat, do the replacement. - if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { +// if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { SmallVector Elts; for (unsigned i = 0, e = newMask.size(); i != e; ++i) { if (newMask[i] < 0) { @@ -1445,8 +1490,8 @@ } if (!newRHS) newRHS = UndefValue::get(newLHS->getType()); - return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); - } + return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); +// } // If the result mask is an identity, replace uses of this instruction with // corresponding argument. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6844,16 +6844,27 @@ if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); } case Instruction::ICmp: case Instruction::FCmp: { + // If this is the loop-latch compare for the back branch, just add the + // scalar value. Should this check be done in caller instead? + bool LikelyVectorized = true; + if (I->hasOneUse()) { + if (BranchInst *BI = dyn_cast(I->use_begin()->getUser())) { + if (BI->getParent() == TheLoop->getLoopLatch()) + LikelyVectorized = false; + } + } Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); - VectorTy = ToVectorTy(ValTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); + + if (LikelyVectorized) + VectorTy = ToVectorTy(ValTy, VF); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); } case Instruction::Store: case Instruction::Load: {
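Finally, the LoopVectorize change keeps the loop-latch exit compare scalar instead of pricing it as a vector compare; the test it adds boils down to this (sketch, ours):

  // The compare is only costed as a scalar when its single user is the
  // conditional branch in the loop latch block.
  static bool isLoopLatchCmp(const Instruction *Cmp, const Loop *TheLoop) {
    if (!Cmp->hasOneUse())
      return false;
    auto *BI = dyn_cast<BranchInst>(Cmp->user_back());
    return BI && BI->getParent() == TheLoop->getLoopLatch();
  }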