Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -884,6 +884,7 @@
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1068,11 +1068,14 @@
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);

-    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors, so we don't custom lower v16i8
+    // vselects for now.
     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);

     // i8 and i16 vectors are custom, because the source register and source
     // memory operand types are not the same width.  f32 vectors are
@@ -1188,10 +1191,10 @@
     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);

-    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);

     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
@@ -1236,6 +1239,7 @@
       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);

+      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
@@ -4694,11 +4698,17 @@
   return getInsertVINSERTImmediate(N, 256);
 }

+/// isZero - Returns true if V is a constant integer zero.
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
     return CFP->getValueAPF().isPosZero();
   return false;
@@ -7961,6 +7971,117 @@
   return SDValue();
 }

+static bool BUILD_VECTORtoBlendMask(SDValue BuildVector, unsigned &MaskValue,
+                                    SelectionDAG &DAG) {
+  // Check the mask for BLEND and build the value.
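+  //
+  // Illustrative example (not part of the original comment): a v8i16
+  // condition of
+  //   (build_vector 1, 0, 0, 0, 1, 0, 0, 0)
+  // has a single lane and yields MaskValue == 0x11.  A v16i16 condition has
+  // two 8-element lanes, which must agree (or be undef) on every mask bit,
+  // since VPBLENDW applies one 8-bit immediate to both 128-bit lanes.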
+  MaskValue = 0;
+  unsigned NumElems = BuildVector.getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+  SDValue Undef = DAG.getUNDEF(MVT::i1);
+
+  // A blend for v16i16 has to be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    // The mask bits have to come from a BUILD_VECTOR of constants.
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector.getOperand(i + NumElemsInLane) : Undef;
+    SDValue EltCond = BuildVector.getOperand(i);
+
+    int Lane1Cond, Lane2Cond;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    else if (EltCond.getOpcode() == ISD::UNDEF)
+      Lane1Cond = -1;
+    else
+      return false;
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+    else if (SndLaneEltCond.getOpcode() == ISD::UNDEF)
+      Lane2Cond = -1;
+    else
+      return false;
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+  if (Cond.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(Cond, MaskValue, DAG))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if the target doesn't have AVX2.
+  // AVX2 introduced VPBLENDD for 128- and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some vselect types were previously set to Expand, not Legal or Custom.
+  // Return an empty SDValue for them so we fall through to Expand after the
+  // Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
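+  //
+  // Illustrative example: a v4f32 vselect whose condition is only known at
+  // run time can't use BLENDPS (it needs a constant immediate); instruction
+  // selection instead matches a variable blend such as BLENDVPS, which takes
+  // the mask in a vector register (implicitly XMM0 in the SSE4.1 encoding).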
+  return Op;
+}
+
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -10746,11 +10867,6 @@
   return false;
 }

-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
     return false;
@@ -14326,6 +14442,7 @@
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
@@ -17631,6 +17748,50 @@
   return std::make_pair(Opc, NeedSplit);
 }

+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+  if (Cond.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(Cond, MaskValue, DAG))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and
 /// VSELECT nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -18166,6 +18327,23 @@
     DCI.CommitTargetLoweringOpt(TLO);
   }

+  // We should generate an X86ISD::BLENDI from a vselect if its condition
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512-bit vector on a machine without AVX-512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  //   (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we know
+  // will be matched by LowerVECTOR_SHUFFLEtoBlend.
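+  //
+  // Illustrative example: a v8i32 half of a split v16i32 select whose
+  // condition legalized to
+  //   (sign_extend (sign_extend_inreg (build_vector 1, 0, 1, 0, ...)))
+  // is rewritten here into a single (vector_shuffle LHS, RHS) node, which
+  // the existing shuffle lowering can then match as a blend with immediate.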
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }

Index: test/CodeGen/X86/avx-blend.ll
===================================================================
--- test/CodeGen/X86/avx-blend.ll
+++ test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
 ; AVX128 tests:

 ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@

 ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@

 ;CHECK-LABEL: vsel_float8:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@

 ;CHECK-LABEL: vsel_i328:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +69,8 @@
 }

 ;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +78,8 @@
 }

 ;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $1
+;CHECK: vblendpd $1
 ;CHECK: ret
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +88,7 @@
 }

 ;CHECK-LABEL: vsel_double4:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
 ;CHECK-NEXT: ret
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -111,5 +113,3 @@
   %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
   ret <2 x double> %min
 }
-
-
Index: test/CodeGen/X86/avx2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx2.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll
+++ test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@

 ; Verify that we produce movss instead of blendvps when possible.
 ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@
 }

 ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
 }
@@ -21,14 +21,8 @@
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
 ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll
+++ test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s

 ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@

 ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
 }

 ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@

 ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -576,3 +576,11 @@
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}