Index: docs/CommandGuide/llvm-symbolizer.rst
===================================================================
--- docs/CommandGuide/llvm-symbolizer.rst
+++ docs/CommandGuide/llvm-symbolizer.rst
@@ -61,11 +61,14 @@
 -------
 
 .. option:: -obj
+
+  Path to object file to be symbolized.
 
-.. option:: -functions
+.. option:: -functions=[none|short|linkage]
 
-  Print function names as well as source file/line locations. Defaults to true.
+  Specify the way function names are printed (omit function name,
+  print short function name, or print full linkage name, respectively).
+  Defaults to ``linkage``.
 
 .. option:: -use-symbol-table
Index: include/llvm/CodeGen/MachineOperand.h
===================================================================
--- include/llvm/CodeGen/MachineOperand.h
+++ include/llvm/CodeGen/MachineOperand.h
@@ -42,7 +42,7 @@
 ///
 class MachineOperand {
 public:
-  enum MachineOperandType {
+  enum MachineOperandType : unsigned char {
     MO_Register,          ///< Register operand.
     MO_Immediate,         ///< Immediate operand
     MO_CImmediate,        ///< Immediate >64bit operand
@@ -65,7 +65,7 @@
 private:
   /// OpKind - Specify what kind of operand this is.  This discriminates the
   /// union.
-  unsigned char OpKind; // MachineOperandType
+  MachineOperandType OpKind;
 
   /// Subregister number for MO_Register.  A value of 0 indicates the
   /// MO_Register has no subReg.
Index: include/llvm/DebugInfo/DIContext.h
===================================================================
--- include/llvm/DebugInfo/DIContext.h
+++ include/llvm/DebugInfo/DIContext.h
@@ -70,7 +70,7 @@
 /// should be filled with data.
 struct DILineInfoSpecifier {
   enum class FileLineInfoKind { None, Default, AbsoluteFilePath };
-  enum class FunctionNameKind { None, LinkageName };
+  enum class FunctionNameKind { None, ShortName, LinkageName };
 
   FileLineInfoKind FLIKind;
   FunctionNameKind FNKind;
Index: lib/DebugInfo/DWARFDebugInfoEntry.cpp
===================================================================
--- lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -277,13 +277,15 @@
                                          FunctionNameKind Kind) const {
   if (!isSubroutineDIE() || Kind == FunctionNameKind::None)
     return nullptr;
-  // Try to get mangled name if possible.
-  if (const char *name =
-          getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
-    return name;
-  if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name,
-                                                   nullptr))
-    return name;
+  // Try to get the mangled name only if it was asked for.
+  if (Kind == FunctionNameKind::LinkageName) {
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
+      return name;
+    if (const char *name =
+            getAttributeValueAsString(U, DW_AT_linkage_name, nullptr))
+      return name;
+  }
   if (const char *name = getAttributeValueAsString(U, DW_AT_name, nullptr))
     return name;
   // Try to get name from specification DIE.
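For illustration, an invocation using the new option might look like this
(binary path, address, and output below are made up, not taken from the
patch's test inputs):

  $ echo "/tmp/a.out 0x400560" | llvm-symbolizer --functions=short --demangle=false
  inlined_func
  /tmp/example.cc:10:0

  $ echo "/tmp/a.out 0x400560" | llvm-symbolizer --functions=linkage --demangle=false
  _Z12inlined_funcv
  /tmp/example.cc:10:0

With --functions=none only the file/line locations would be printed.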
Index: lib/Target/PowerPC/PPCAsmPrinter.cpp
===================================================================
--- lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -208,7 +208,7 @@
   }
 
   default:
-    O << "<unknown operand type>";
+    O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
     return;
   }
 }
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -884,6 +884,7 @@
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1068,11 +1068,14 @@
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
 
-    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors, so we don't custom lower
+    // byte vselects for now.
     setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
 
     // i8 and i16 vectors are custom, because the source register and source
     // memory operand types are not the same width.  f32 vectors are
@@ -1188,10 +1191,10 @@
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
 
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
@@ -1236,6 +1239,7 @@
       setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
 
+      setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
@@ -4694,11 +4698,17 @@
   return getInsertVINSERTImmediate(N, 256);
 }
 
+/// isZero - Returns true if V is a constant integer zero.
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
     return CFP->getValueAPF().isPosZero();
   return false;
 }
@@ -7402,6 +7412,23 @@
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
 // It is only safe to call this function if isINSERTPSMask is true for
 // this shufflevector mask.
 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7413,7 +7440,6 @@
   // If we're transferring an i32 from memory to a specific element in a
   // register, we output a generic DAG that will match the PINSRD
   // instruction.
-  // TODO: Optimize for AVX cases too (VINSERTPS)
   MVT VT = SVOp->getSimpleValueType(0);
   MVT EVT = VT.getVectorElementType();
   SDValue V1 = SVOp->getOperand(0);
@@ -7446,17 +7472,10 @@
     // Trivial case, when From comes from a load and is only used by the
     // shuffle. Make it use insertps from the vector that we need from that
     // load.
-    SDValue Addr = From.getOperand(1);
-    SDValue NewAddr =
-        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                    Addr.getSimpleValueType()));
-
-    LoadSDNode *Load = cast<LoadSDNode>(From);
     SDValue NewLoad =
-        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                    DAG.getMachineFunction().getMachineMemOperand(
-                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
 
     if (EVT == MVT::f32) {
       // Create this as a scalar to vector to match the instruction pattern.
@@ -7961,6 +7980,105 @@
   return SDValue();
 }
 
+// This function assumes its argument is a BUILD_VECTOR of constant or
+// undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
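A small standalone sketch (not part of the patch) of the lane-folding logic
above: for a two-lane v16i16 condition, the two 8-element lanes must agree
element-wise, and the agreeing bits become the 8-bit blend immediate. Undef
handling is omitted and the condition values are made up.

  // Illustration of BUILD_VECTORtoBlendMask's lane folding on plain ints.
  #include <cstdio>
  #include <vector>

  int main() {
    // A v16i16 condition whose two 8-element lanes agree element-wise.
    std::vector<int> Cond = {1, 0, 1, 0, 0, 1, 1, 0,
                             1, 0, 1, 0, 0, 1, 1, 0};
    unsigned NumElems = Cond.size();
    unsigned NumLanes = (NumElems - 1) / 8 + 1;
    unsigned NumElemsInLane = NumElems / NumLanes;
    unsigned MaskValue = 0;
    for (unsigned i = 0; i < NumElemsInLane; ++i) {
      int Lane1Cond = Cond[i];
      int Lane2Cond = (NumLanes == 2) ? Cond[i + NumElemsInLane] : Lane1Cond;
      if (Lane1Cond != Lane2Cond) {
        // Asymmetric lanes: no single immediate can express this blend.
        std::puts("lanes disagree: no immediate blend");
        return 1;
      }
      MaskValue |= (unsigned)Lane1Cond << i;
    }
    std::printf("blend immediate: 0x%x\n", MaskValue); // prints 0x65
    return 0;
  }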
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if this is not AVX2.
+  // AVX2 introduced the VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall through to Expand, after
+  // the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
+  return Op;
+}
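For example (illustrative IR, not one of the patch's tests), a select with a
constant condition such as the following now reaches the Custom path above
and can come out as a single blend-with-immediate on SSE4.1 and later,
instead of a blendv or and/andn/or sequence:

  define <4 x i32> @blend_imm(<4 x i32> %a, <4 x i32> %b) {
    %r = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %a, <4 x i32> %b
    ret <4 x i32> %r
  }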
+
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -10746,11 +10864,6 @@
   return false;
 }
 
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
     return false;
@@ -14326,6 +14439,7 @@
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -17631,6 +17745,51 @@
   return std::make_pair(Opc, NeedSplit);
 }
 
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
 /// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -18139,7 +18298,13 @@
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
-      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+      !DCI.isBeforeLegalize() &&
+      // We explicitly check against v8i16 and v16i16 because, although
+      // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care of in a later
+      // condition.
+      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+       VT != MVT::v8i16)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
@@ -18166,6 +18331,23 @@
       DCI.CommitTargetLoweringOpt(TLO);
   }
 
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of constants.
+  // This specific pattern gets generated when we split a selector for a
+  // 512 bit vector on a machine without AVX512 (but with 256-bit vectors),
+  // during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // If we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }
 
@@ -20093,6 +20275,29 @@
   return SDValue();
 }
 
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20396,6 +20601,8 @@
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -6550,6 +6550,27 @@
   defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
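Roughly, the kind of IR these patterns catch (a sketch with made-up names;
the sse41.ll and avx2.ll tests added below are the authoritative cases) is an
insertps whose source lane comes through a splat of a loaded vector, which
now folds into a single memory-operand insertps:

  declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)

  define <4 x float> @fold(<4 x float> %x, <4 x float>* %p) {
    %v = load <4 x float>* %p, align 16
    %e = extractelement <4 x float> %v, i32 0
    %s = insertelement <4 x float> undef, float %e, i32 0
    %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %x, <4 x float> %s, i32 48)
    ret <4 x float> %r
  }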
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1220,6 +1220,65 @@
     if (MadeChange)
       return &GEP;
   }
 
+  // Check to see if the inputs to the PHI node are getelementptr instructions.
+  if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
+    GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+    if (!Op1)
+      return nullptr;
+
+    int DI = -1;
+
+    for (auto I = PN->op_begin()+1, E = PN->op_end(); I != E; ++I) {
+      GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+      if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+        return nullptr;
+
+      for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+        if (Op1->getOperand(J) != Op2->getOperand(J)) {
+          if (DI == -1) {
+            // We have not seen any differences in the GEPs feeding the
+            // PHI yet, so we record this one.
+            DI = J;
+          } else {
+            // The GEP is different by more than one input. While this could be
+            // extended to support GEPs that vary by more than one variable it
+            // doesn't make sense since it greatly increases the complexity and
+            // would result in an R+R+R addressing mode which no backend
+            // directly supports and would need to be broken into several
+            // simpler instructions anyway.
+            return nullptr;
+          }
+        }
+      }
+    }
+
+    GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+
+    if (DI == -1) {
+      // All the GEPs feeding the PHI are identical. Clone one down into our
+      // BB so that it can be merged with the current GEP.
+      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+                                            NewGEP);
+    } else {
+      // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+      // into the current block so it can be merged, and create a new PHI to
+      // set that index.
+      PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(),
+                                          PN->getNumOperands());
+      for (auto &I : PN->operands())
+        NewPN->addIncoming(cast<GetElementPtrInst>(I)->getOperand(DI),
+                           PN->getIncomingBlock(I));
+
+      NewGEP->setOperand(DI, NewPN);
+      GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+                                            NewGEP);
+    }
+
+    GEP.setOperand(0, NewGEP);
+    PtrOp = NewGEP;
+  }
+
   // Combine Indices - If the source pointer to this getelementptr instruction
   // is a getelementptr instruction, combine the indices of the two
   // getelementptr instructions into a single instruction.
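In rough terms, the new InstCombine fold rewrites a PHI over GEPs that differ
in exactly one index into a PHI over that index feeding a single GEP, which
can then merge with a user GEP. A sketch in IR (the type %T and all value
names here are made up; the gepphigep.ll test below exercises the real
pattern):

  ; before
  bb1: %g1 = getelementptr inbounds %T* %base, i64 %i
  bb2: %g2 = getelementptr inbounds %T* %base, i64 %j
  bb3: %p = phi %T* [ %g1, %bb1 ], [ %g2, %bb2 ]
       %q = getelementptr inbounds %T* %p, i64 0, i32 1

  ; after (conceptually)
  bb3: %idx = phi i64 [ %i, %bb1 ], [ %j, %bb2 ]
       %q = getelementptr inbounds %T* %base, i64 %idx, i32 1

GEPs differing in two or more indices are rejected, as the comment above
explains, because merging them would need an R+R+R addressing mode.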
Index: test/Analysis/CostModel/X86/vselect-cost.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/X86/vselect-cost.ll
@@ -0,0 +1,126 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+
+; Verify the cost of vector select instructions.
+
+; SSE41 added blend instructions with an immediate for <2 x double> and
+; <4 x float>. Integers of the same size should also use those instructions.
+
+define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2i64':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %sel = select <2 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+  %sel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %sel
+}
+
+define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2double':
+; SSE2: Cost Model: {{.*}} 3 for instruction: %sel = select <2 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
+  %sel = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
+  ret <2 x double> %sel
+}
+
+define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i32':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %sel
+}
+
+define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4float':
+; SSE2: Cost Model: {{.*}} 7 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %sel
+}
+
+define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i8':
+; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+  %sel = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %sel
+}
+
+; AVX added blend instructions with an immediate for <4 x double> and
+; <8 x float>. Integers of the same size should also use those instructions.
+define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i64':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i64> %a, <4 x i64> %b
+  ret <4 x i64> %sel
+}
+
+define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4double':
+; SSE2: Cost Model: {{.*}} 6 for instruction: %sel = select <4 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
+  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %a, <4 x double> %b
+  ret <4 x double> %sel
+}
+
+define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8i32':
+; SSE2: Cost Model: {{.*}} 16 for instruction: %sel = select <8 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+  %sel = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i32> %a, <8 x i32> %b
+  ret <8 x i32> %sel
+}
+
+define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8float':
+; SSE2: Cost Model: {{.*}} 14 for instruction: %sel = select <8 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
+  %sel = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %a, <8 x float> %b
+  ret <8 x float> %sel
+}
+
+; AVX2
+define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i16':
+; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <16 x i1>
+;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
+; AVX: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
+  %sel = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i16> %a, <16 x i16> %b
+  ret <16 x i16> %sel
+}
+
+define <32 x i8> @test_32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_32i8':
+; SSE2: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <32 x i1>
+;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
+; AVX: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <32 x i1>
+  %sel = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <32 x i8> %a, <32 x i8> %b
+  ret <32 x i8> %sel
+}
+
Index: test/CodeGen/X86/avx-blend.ll
===================================================================
--- test/CodeGen/X86/avx-blend.ll
+++ test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
 ; AVX128 tests:
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@
 
 ;CHECK-LABEL: vsel_float8:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@
 
 ;CHECK-LABEL: vsel_i328:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +69,8 @@
 }
 
 ;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +78,8 @@
 }
 
 ;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +88,7 @@
 
 ;CHECK-LABEL: vsel_double4:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
 ;CHECK-NEXT: ret
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -111,5 +113,3 @@
   %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %min
 }
-
-
Index: test/CodeGen/X86/avx2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll
+++ test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
 
 ; Verify that we produce movss instead of blendvps when possible.
 ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@
 }
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,8 @@
   ret <4 x i8> %vsel
 }
 
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
 ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/fold-load-vec.ll
===================================================================
--- test/CodeGen/X86/fold-load-vec.ll
+++ test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
 ; loads from m32.
 define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
 ; CHECK: insertps
 entry:
   %source.addr = alloca <4 x float>*, align 8
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll
+++ test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@
 
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@
 }
 
 ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -576,3 +576,119 @@
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
Index: test/DebugInfo/llvm-symbolizer.test
===================================================================
--- test/DebugInfo/llvm-symbolizer.test
+++ test/DebugInfo/llvm-symbolizer.test
@@ -10,7 +10,7 @@
 RUN: echo "%p/Inputs/macho-universal:x86_64 0x100000f05" >> %t.input
 RUN: echo "%p/Inputs/llvm-symbolizer-dwo-test 0x400514" >> %t.input
 
-RUN: llvm-symbolizer --functions --inlining --demangle=false \
+RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
 RUN:    --default-arch=i386 < %t.input | FileCheck %s
 
 CHECK: main
@@ -87,3 +87,9 @@
 RUN:   | FileCheck %s --check-prefix=STRIPPED
 
 STRIPPED: global_func
+
+RUN: echo "%p/Inputs/dwarfdump-test4.elf-x86-64 0x62c" > %t.input7
+RUN: llvm-symbolizer --functions=short --use-symbol-table=false --demangle=false < %t.input7 \
+RUN:    | FileCheck %s --check-prefix=SHORT_FUNCTION_NAME
+
+SHORT_FUNCTION_NAME-NOT: _Z1cv
Index: test/Transforms/InstCombine/gepphigep.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/gepphigep.ll
@@ -0,0 +1,56 @@
+; RUN: opt -instcombine -S  < %s | FileCheck %s
+
+%struct1 = type { %struct2*, i32, i32, i32 }
+%struct2 = type { i32, i32 }
+
+define i32 @test1(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+  %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+  %tmp1 = load %struct2** %tmp, align 8
+  br i1 %tmp4, label %bb1, label %bb2
+
+bb1:
+  %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+  %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+  store i32 0, i32* %tmp11, align 4
+  br label %bb3
+
+bb2:
+  %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+  %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+  store i32 0, i32* %tmp21, align 4
+  br label %bb3
+
+bb3:
+  %phi = phi %struct2* [ %tmp10, %bb1 ], [ %tmp20, %bb2 ]
+  %tmp24 = getelementptr inbounds %struct2* %phi, i64 0, i32 1
+  %tmp25 = load i32* %tmp24, align 4
+  ret i32 %tmp25
+
+; CHECK-LABEL: @test1(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: %[[PHI:[0-9A-Za-z]+]] = phi i64 [ %tmp9, %bb1 ], [ %tmp19, %bb2 ]
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %[[PHI]], i32 1
+}
+
+define i32 @test2(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+  %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+  %tmp1 = load %struct2** %tmp, align 8
+  %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+  %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+  store i32 0, i32* %tmp11, align 4
+  %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+  %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+  store i32 0, i32* %tmp21, align 4
+  %tmp24 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 1
+  %tmp25 = load i32* %tmp24, align 4
+  ret i32 %tmp25
+
+; CHECK-LABEL: @test2(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 1
+}
Index: tools/llvm-symbolizer/LLVMSymbolize.h
===================================================================
--- tools/llvm-symbolizer/LLVMSymbolize.h
+++ tools/llvm-symbolizer/LLVMSymbolize.h
@@ -24,6 +24,7 @@
 
 namespace llvm {
 
+typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
 using namespace object;
 
 namespace symbolize {
@@ -34,17 +35,17 @@
 public:
   struct Options {
     bool UseSymbolTable : 1;
-    bool PrintFunctions : 1;
+    FunctionNameKind PrintFunctions;
    bool PrintInlining : 1;
     bool Demangle : 1;
     std::string DefaultArch;
-    Options(bool UseSymbolTable = true, bool PrintFunctions = true,
+    Options(bool UseSymbolTable = true,
+            FunctionNameKind PrintFunctions = FunctionNameKind::LinkageName,
             bool PrintInlining = true, bool Demangle = true,
             std::string DefaultArch = "")
         : UseSymbolTable(UseSymbolTable), PrintFunctions(PrintFunctions),
           PrintInlining(PrintInlining), Demangle(Demangle),
-          DefaultArch(DefaultArch) {
-    }
+          DefaultArch(DefaultArch) {}
   };
 
   LLVMSymbolizer(const Options &Opts = Options()) : Opts(Opts) {}
Index: tools/llvm-symbolizer/LLVMSymbolize.cpp
===================================================================
--- tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ tools/llvm-symbolizer/LLVMSymbolize.cpp
@@ -39,8 +39,7 @@
 getDILineInfoSpecifier(const LLVMSymbolizer::Options &Opts) {
   return DILineInfoSpecifier(
       DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
-      Opts.PrintFunctions ? DILineInfoSpecifier::FunctionNameKind::LinkageName
-                          : DILineInfoSpecifier::FunctionNameKind::None);
+      Opts.PrintFunctions);
 }
 
 ModuleInfo::ModuleInfo(ObjectFile *Obj, DIContext *DICtx)
@@ -117,7 +116,7 @@
         ModuleOffset, getDILineInfoSpecifier(Opts));
   }
   // Override function name from symbol table if necessary.
-  if (Opts.PrintFunctions && Opts.UseSymbolTable) {
+  if (Opts.PrintFunctions != FunctionNameKind::None && Opts.UseSymbolTable) {
     std::string FunctionName;
     uint64_t Start, Size;
     if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset,
@@ -140,7 +139,7 @@
       InlinedContext.addFrame(DILineInfo());
   }
   // Override the function name in lower frame with name from symbol table.
-  if (Opts.PrintFunctions && Opts.UseSymbolTable) {
+  if (Opts.PrintFunctions != FunctionNameKind::None && Opts.UseSymbolTable) {
     DIInliningInfo PatchedInlinedContext;
     for (uint32_t i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) {
       DILineInfo LineInfo = InlinedContext.getFrame(i);
@@ -398,7 +397,7 @@
 // cannot fetch. We replace it to "??" to make our output closer to addr2line.
 static const std::string kDILineInfoBadString = "<invalid>";
 std::stringstream Result;
-  if (Opts.PrintFunctions) {
+  if (Opts.PrintFunctions != FunctionNameKind::None) {
     std::string FunctionName = LineInfo.FunctionName;
     if (FunctionName == kDILineInfoBadString)
       FunctionName = kBadString;
Index: tools/llvm-symbolizer/llvm-symbolizer.cpp
===================================================================
--- tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -35,10 +35,15 @@
          cl::desc("Prefer names in symbol table to names "
                   "in debug info"));
 
-static cl::opt<bool>
-ClPrintFunctions("functions", cl::init(true),
-                 cl::desc("Print function names as well as line "
-                          "information for a given address"));
+static cl::opt<FunctionNameKind> ClPrintFunctions(
+    "functions", cl::init(FunctionNameKind::LinkageName),
+    cl::desc("Print function name for a given address:"),
+    cl::values(clEnumValN(FunctionNameKind::None, "none", "omit function name"),
+               clEnumValN(FunctionNameKind::ShortName, "short",
+                          "print short function name"),
+               clEnumValN(FunctionNameKind::LinkageName, "linkage",
+                          "print function linkage name"),
+               clEnumValEnd));
 
 static cl::opt<bool>
 ClPrintInlining("inlining", cl::init(true),
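A usage sketch of the updated Options interface from C++ (illustrative only;
includes, error handling, and the module path are assumptions, and the
symbolizer output format is elided):

  #include "LLVMSymbolize.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;
  using namespace llvm::symbolize;

  void printShortName() {
    // Ask for short function names instead of the default linkage names.
    LLVMSymbolizer::Options Opts(/*UseSymbolTable=*/true,
                                 FunctionNameKind::ShortName,
                                 /*PrintInlining=*/true,
                                 /*Demangle=*/false,
                                 /*DefaultArch=*/"");
    LLVMSymbolizer Symbolizer(Opts);
    // Symbolize an offset in a module; prints the short function name plus
    // file/line, mirroring what --functions=short does on the command line.
    outs() << Symbolizer.symbolizeCode("/tmp/a.out", 0x400560);
  }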