Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2929,16 +2929,10 @@
   case Intrinsic::x86_avx_blendv_ps_256:
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
-    // Convert blendv* to vector selects if the mask is constant.
-    // This optimization is convoluted because the intrinsic is defined as
-    // getting a vector of floats or doubles for the ps and pd versions.
-    // FIXME: That should be changed.
-
+    // fold (blend A, A, Mask) -> A
     Value *Op0 = II->getArgOperand(0);
     Value *Op1 = II->getArgOperand(1);
     Value *Mask = II->getArgOperand(2);
-
-    // fold (blend A, A, Mask) -> A
     if (Op0 == Op1)
       return replaceInstUsesWith(CI, Op0);
 
@@ -2951,6 +2945,20 @@
       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }
+
+    // Convert to a vector select if we can bypass casts and find a boolean
+    // vector condition value.
+    Value *BoolVec;
+    if (match(peekThroughBitcast(Mask), m_SExt(m_Value(BoolVec)))) {
+      auto *VTy = dyn_cast<VectorType>(BoolVec->getType());
+      if (VTy && VTy->getScalarSizeInBits() == 1 &&
+          VTy->getVectorNumElements() == II->getType()->getVectorNumElements())
+        return SelectInst::Create(BoolVec, Op1, Op0);
+      // TODO: If we can find a boolean vector condition with fewer elements,
+      //       then we can form a vector select by bitcasting Op0/Op1 to a
+      //       vector type with wider elements and bitcasting the result.
+    }
+
     break;
   }
Index: llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
+++ llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -157,9 +157,7 @@
 
 define <4 x float> @sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
 ; CHECK-LABEL: @sel_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = sext <4 x i1> [[COND:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32> [[S]] to <4 x float>
-; CHECK-NEXT:    [[R:%.*]] = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[X:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = sext <4 x i1> %cond to <4 x i32>
@@ -170,9 +168,7 @@
 
 define <2 x double> @sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i1> %cond) {
 ; CHECK-LABEL: @sel_v2f64(
-; CHECK-NEXT:    [[S:%.*]] = sext <2 x i1> [[COND:%.*]] to <2 x i64>
-; CHECK-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[S]] to <2 x double>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[B]])
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %s = sext <2 x i1> %cond to <2 x i64>
@@ -198,8 +194,7 @@
 
 define <16 x i8> @sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i1> %cond) {
 ; CHECK-LABEL: @sel_v16i8(
-; CHECK-NEXT:    [[S:%.*]] = sext <16 x i1> [[COND:%.*]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[S]])
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[X:%.*]]
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
 ;
   %s = sext <16 x i1> %cond to <16 x i8>
@@ -217,9 +212,7 @@
 ; CHECK-LABEL: @sel_v4f32_sse_reality(
 ; CHECK-NEXT:    [[LD:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x float> [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-; CHECK-NEXT:    [[COND:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-; CHECK-NEXT:    [[R:%.*]] = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> [[LD]], <4 x float> zeroinitializer, <4 x float> [[COND]])
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[CMP]], <4 x float> zeroinitializer, <4 x float> [[LD]]
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %ld = load <4 x float>, <4 x float>* %x, align 16
@@ -234,9 +227,7 @@
 ; CHECK-LABEL: @sel_v2f64_sse_reality(
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <2 x double> [[Z:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-; CHECK-NEXT:    [[COND:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-; CHECK-NEXT:    [[R:%.*]] = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[LD]], <2 x double> zeroinitializer, <2 x double> [[COND]])
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP]], <2 x double> zeroinitializer, <2 x double> [[LD]]
 ; CHECK-NEXT:    ret <2 x double> [[R]]
 ;
   %ld = load <2 x double>, <2 x double>* %x, align 16
@@ -247,6 +238,8 @@
   ret <2 x double> %r
 }
 
+; TODO: We can bitcast the inputs to the select and the result and remove the intrinsic.
+
 define <2 x i64> @sel_v4i32_sse_reality(<2 x i64>* nocapture readonly %x, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: @sel_v4i32_sse_reality(
 ; CHECK-NEXT:    [[XCAST:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <16 x i8>*
@@ -279,8 +272,7 @@
 ; CHECK-NEXT:    [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <16 x i8>
 ; CHECK-NEXT:    [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <16 x i8>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <16 x i8> [[YCAST]], [[ZCAST]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <16 x i1> [[CMP]] to <16 x i8>
-; CHECK-NEXT:    [[R:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[LD]], <16 x i8> zeroinitializer, <16 x i8> [[SEXT]])
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[CMP]], <16 x i8> zeroinitializer, <16 x i8> [[LD]]
 ; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <16 x i8> [[R]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[RCAST]]
 ;
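
For reference, a minimal standalone sketch in LLVM IR of the new fold, mirroring the @sel_v4f32 test above; the function name @blendv_example is hypothetical and not part of the patch:

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)

; Before InstCombine: the blend mask is a sign-extended <4 x i1> condition
; bitcast to <4 x float> to satisfy the intrinsic's signature.
define <4 x float> @blendv_example(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
  %s = sext <4 x i1> %cond to <4 x i32>
  %b = bitcast <4 x i32> %s to <4 x float>
  %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x, <4 x float> %y, <4 x float> %b)
  ret <4 x float> %r
}

; After InstCombine: peekThroughBitcast() skips the bitcast, m_SExt() recovers
; %cond, and the element counts match, so the intrinsic becomes a vector select
; (blendv picks the second operand where the mask's sign bit is set).
define <4 x float> @blendv_example.after(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
  %r = select <4 x i1> %cond, <4 x float> %y, <4 x float> %x
  ret <4 x float> %r
}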