Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -896,6 +896,12 @@
     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
     return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
   };
+  auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width,
+                                              unsigned DemandedWidth) {
+    APInt HighElts = APInt::getHighBitsSet(Width, DemandedWidth);
+    APInt IgnoredUndef(Width, 0); // Not examined after the call below.
+    return SimplifyDemandedVectorElts(Op, HighElts, IgnoredUndef);
+  };
 
   switch (II->getIntrinsicID()) {
   default: break;
@@ -1234,6 +1240,49 @@
     break;
   }
 
+  case Intrinsic::x86_sse_add_ss:
+  case Intrinsic::x86_sse_sub_ss:
+  case Intrinsic::x86_sse_mul_ss:
+  case Intrinsic::x86_sse_div_ss:
+  case Intrinsic::x86_sse_min_ss:
+  case Intrinsic::x86_sse_max_ss:
+  case Intrinsic::x86_sse_cmp_ss:
+  case Intrinsic::x86_sse2_add_sd:
+  case Intrinsic::x86_sse2_sub_sd:
+  case Intrinsic::x86_sse2_mul_sd:
+  case Intrinsic::x86_sse2_div_sd:
+  case Intrinsic::x86_sse2_min_sd:
+  case Intrinsic::x86_sse2_max_sd:
+  case Intrinsic::x86_sse2_cmp_sd: {
+    // Of the second operand, only element 0 is demanded by these scalar
+    // intrinsics, so try to simplify away everything above it.
+    Value *RHS = II->getArgOperand(1);
+    unsigned NumElts = RHS->getType()->getVectorNumElements();
+    Value *NewRHS = SimplifyDemandedVectorEltsLow(RHS, NumElts, 1);
+    if (!NewRHS)
+      break;
+    II->setArgOperand(1, NewRHS);
+    return II;
+  }
+
+  case Intrinsic::x86_sse41_round_ss:
+  case Intrinsic::x86_sse41_round_sd: {
+    // Demanded inputs: every element but the lowest of the first operand,
+    // and just the lowest element of the second operand.
+    Value *Upper = II->getArgOperand(0);
+    Value *Lower = II->getArgOperand(1);
+    unsigned Elts = Upper->getType()->getVectorNumElements();
+    if (Value *V = SimplifyDemandedVectorEltsHigh(Upper, Elts, Elts - 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    Value *NewLower = SimplifyDemandedVectorEltsLow(Lower, Elts, 1);
+    if (!NewLower)
+      break;
+    II->setArgOperand(1, NewLower);
+    return II;
+  }
+
   // Constant fold ashr( <A x Bi>, Ci ).
   // Constant fold lshr( <A x Bi>, Ci ).
   // Constant fold shl( <A x Bi>, Ci ).
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1179,16 +1179,61 @@
     switch (II->getIntrinsicID()) {
     default: break;
 
+    // Unary scalar-as-vector operations that work column-wise.  Element 0 is
+    // computed from Arg0[0]; the upper elements are passed through from Arg0.
+    case Intrinsic::x86_sse_rcp_ss:
+    case Intrinsic::x86_sse_rsqrt_ss:
+    case Intrinsic::x86_sse_sqrt_ss:
+    case Intrinsic::x86_sse2_sqrt_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+                                        UndefElts, Depth + 1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (!DemandedElts[0])
+        return II->getArgOperand(0);
+      break;
+
+    // XOP VFRCZ scalar ops compute element 0 from Arg0[0] but, unlike the SSE
+    // scalar ops above, zero the upper result elements instead of passing
+    // Arg0 through, so they must never be replaced by Arg0.
+    case Intrinsic::x86_xop_vfrcz_ss:
+    case Intrinsic::x86_xop_vfrcz_sd: {
+      // If the lowest element isn't used the whole result is known zero.
+      if (!DemandedElts[0]) {
+        UndefElts.clearAllBits();
+        return ConstantAggregateZero::get(II->getType());
+      }
+
+      // Only the lowest element of the source is read.
+      APInt LowElt(DemandedElts.getBitWidth(), 1);
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), LowElt,
+                                        UndefElts, Depth + 1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+      // Only the lowest element can be undef; the upper elements are zero.
+      UndefElts &= LowElt;
+      break;
+    }
+
     // Binary vector operations that work column-wise.  A dest element is a
     // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_add_ss:
     case Intrinsic::x86_sse_sub_ss:
     case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_div_ss:
     case Intrinsic::x86_sse_min_ss:
     case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse_cmp_ss:
+    case Intrinsic::x86_sse2_add_sd:
     case Intrinsic::x86_sse2_sub_sd:
     case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_div_sd:
     case Intrinsic::x86_sse2_min_sd:
     case Intrinsic::x86_sse2_max_sd:
+    case Intrinsic::x86_sse2_cmp_sd:
+    case Intrinsic::x86_sse41_round_ss:
+    case Intrinsic::x86_sse41_round_sd:
       TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
                                         UndefElts, Depth + 1);
       if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1201,11 +1225,13 @@
       if (DemandedElts == 1) {
         switch (II->getIntrinsicID()) {
         default: break;
+        case Intrinsic::x86_sse_add_ss:
         case Intrinsic::x86_sse_sub_ss:
         case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_add_sd:
         case Intrinsic::x86_sse2_sub_sd:
         case Intrinsic::x86_sse2_mul_sd:
-          // TODO: Lower MIN/MAX/ABS/etc
+          // TODO: Lower MIN/MAX/etc
           Value *LHS = II->getArgOperand(0);
           Value *RHS = II->getArgOperand(1);
           // Extract the element as scalars.
@@ -1216,6 +1242,11 @@
 
           switch (II->getIntrinsicID()) {
           default: llvm_unreachable("Case stmts out of sync!");
+          case Intrinsic::x86_sse_add_ss:
+          case Intrinsic::x86_sse2_add_sd: // Scalar fadd of LHS and RHS.
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
           case Intrinsic::x86_sse_sub_ss:
           case Intrinsic::x86_sse2_sub_sd:
             TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
@@ -1238,6 +1269,10 @@
         }
       }
 
+      // With element 0 not demanded, only Arg0's pass-through elements remain.
+      if (!DemandedElts[0])
+        return II->getArgOperand(0);
+
       // Output elements are undefined if both are undefined.  Consider things
       // like undef&0.  The result is known zero, not undef.
       UndefElts &= UndefElts2;
Index: test/Transforms/InstCombine/x86-sse.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse.ll
+++ test/Transforms/InstCombine/x86-sse.ll
@@ -4,12 +4,9 @@
 define float @test_rcp_ss_0(float %a) {
 ; CHECK-LABEL: @test_rcp_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
   %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -19,15 +16,24 @@
   ret float %6
 }
 
+define float @test_rcp_ss_1(float %a) {
+; CHECK-LABEL: @test_rcp_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 1
+  ret float %6
+}
+
 define float @test_sqrt_ss_0(float %a) {
 ; CHECK-LABEL: @test_sqrt_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
   %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -37,15 +43,24 @@
   ret float %6
 }
 
+define float @test_sqrt_ss_2(float %a) {
+; CHECK-LABEL: @test_sqrt_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 2
+  ret float %6
+}
+
 define float @test_rsqrt_ss_0(float %a) {
 ; CHECK-LABEL: @test_rsqrt_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
   %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -55,19 +70,33 @@
   ret float %6
 }
 
+define float @test_rsqrt_ss_3(float %a) {
+; CHECK-LABEL: @test_rsqrt_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 3
+  ret float %6
+}
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_add_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_add_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_add_ss_0
-; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %8)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
-; CHECK-NEXT: ret float %r
+; CHECK-NEXT: %1 = fadd float %a, %b
+; CHECK-NEXT: ret float %1
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
   %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -81,6 +110,30 @@
   ret float %r
 }
 
+define float @test_add_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_add_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_sub_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_sub_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_sub_ss_0
 ; CHECK-NEXT: %1 = fsub float %a, %b
@@ -98,6 +151,30 @@
   ret float %r
 }
 
+define float @test_sub_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_sub_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 2
+  ret float %7
+}
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_mul_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_mul_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_mul_ss_0
 ; CHECK-NEXT: %1 = fmul float %a, %b
@@ -115,18 +192,36 @@
   ret float %r
 }
 
+define float @test_mul_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_mul_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 3
+  ret float %7
+}
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_div_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_div_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_div_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %8)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %1, <4 x float> %2)
+; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0
 ; CHECK-NEXT: ret float %r
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -141,6 +236,30 @@
   ret float %r
 }
 
+define float @test_div_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_div_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
+define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_min_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_min_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_min_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
@@ -161,6 +280,30 @@
   ret float %10
 }
 
+define float @test_min_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_min_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 2
+  ret float %7
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_max_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %3)
+  ret <4 x float> %4
+}
+
 define float @test_max_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_max_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
@@ -181,18 +324,36 @@
   ret float %10
 }
 
+define float @test_max_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_max_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %5)
+  %7 = extractelement <4 x float> %6, i32 3
+  ret float %7
+}
+
+define <4 x float> @test_cmp_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_cmp_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %b, i8 0)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %3, i8 0)
+  ret <4 x float> %4
+}
+
 define float @test_cmp_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_cmp_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %8, i8 0)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0)
+; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0
 ; CHECK-NEXT: ret float %r
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -207,6 +368,19 @@
   ret float %r
 }
 
+define float @test_cmp_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_cmp_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %5, i8 0)
+  %7 = extractelement <4 x float> %6, i32 1
+  ret float %7
+}
+
 define i32 @test_comieq_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_comieq_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
Index: test/Transforms/InstCombine/x86-sse2.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse2.ll
+++ test/Transforms/InstCombine/x86-sse2.ll
@@ -4,10 +4,9 @@
 define double @test_sqrt_sd_0(double %a) {
 ; CHECK-LABEL: @test_sqrt_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
-; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
-; CHECK-NEXT: ret double %4
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %1)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
@@ -15,15 +14,29 @@
   ret double %4
 }
 
+define double @test_sqrt_sd_1(double %a) {
+; CHECK-LABEL: @test_sqrt_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 1
+  ret double %4
+}
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_add_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_add_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_add_sd_0
-; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %1 = fadd double %a, %b
+; CHECK-NEXT: ret double %1
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = insertelement <2 x double> undef, double %b, i32 0
@@ -33,6 +46,27 @@
   ret double %6 
 }
 
+define double @test_add_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_add_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6 
+}
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_sub_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_sub_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_sub_sd_0
 ; CHECK-NEXT: %1 = fsub double %a, %b
@@ -46,6 +80,27 @@
   ret double %6 
 }
 
+define double @test_sub_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_sub_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6 
+}
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_mul_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_mul_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_mul_sd_0
 ; CHECK-NEXT: %1 = fmul double %a, %b
@@ -59,15 +114,34 @@
   ret double %6 
 }
 
+define double @test_mul_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_mul_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6 
+}
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_div_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_div_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_div_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %1, <2 x double> %2)
+; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
+; CHECK-NEXT: ret double %4
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = insertelement <2 x double> undef, double %b, i32 0
@@ -77,6 +151,27 @@
   ret double %6 
 }
 
+define double @test_div_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_div_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6 
+}
+
+define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_min_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_min_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_min_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
@@ -93,6 +188,27 @@
   ret double %6 
 }
 
+define double @test_min_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_min_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6 
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_max_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %1)
+  ret <2 x double> %2
+}
+
 define double @test_max_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_max_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
@@ -109,15 +225,34 @@
   ret double %6 
 }
 
+define double @test_max_sd_1(double %a, double %b) { ; max.sd: extracting lane 1 of the result should fold to a constant
+; CHECK-LABEL: @test_max_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 ; lane 1 of the first operand; this is the value the expected output returns
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 ; lane 1 of the second operand; does not appear in the expected output
+  %5 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %2, <2 x double> %4)
+  %6 = extractelement <2 x double> %5, i32 1 ; only lane 1 of the result is read
+  ret double %6 
+}
+
+define <2 x double> @test_cmp_sd(<2 x double> %a, <2 x double> %b) { ; cmp.sd: an insert into lane 1 of the second operand should be dropped
+; CHECK-LABEL: @test_cmp_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %b, i8 0)
+; CHECK-NEXT: ret <2 x double> %1
+  %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 ; overwrites only lane 1 of %b; the expected output passes %b directly
+  %2 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %1, i8 0)
+  ret <2 x double> %2
+}
+
 define double @test_cmp_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_cmp_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0)
+; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
+; CHECK-NEXT: ret double %4
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = insertelement <2 x double> undef, double %b, i32 0
@@ -127,6 +262,18 @@
   ret double %6
 }
 
+define double @test_cmp_sd_1(double %a, double %b) { ; cmp.sd: extracting lane 1 of the result should fold to a constant
+; CHECK-LABEL: @test_cmp_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 ; lane 1 of the first operand; this is the value the expected output returns
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 ; lane 1 of the second operand; does not appear in the expected output
+  %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
+  %6 = extractelement <2 x double> %5, i32 1 ; only lane 1 of the result is read
+  ret double %6 
+}
+
 define i32 @test_comieq_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_comieq_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
Index: test/Transforms/InstCombine/x86-sse41.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse41.ll
+++ test/Transforms/InstCombine/x86-sse41.ll
@@ -3,10 +3,8 @@
 
 define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_round_sd
-; CHECK-NEXT: %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10)
-; CHECK-NEXT: ret <2 x double> %3
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 10)
+; CHECK-NEXT: ret <2 x double> %1
   %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
   %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
   %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10)
@@ -15,13 +13,10 @@
 
 define double @test_round_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_round_sd_0
-; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> %1, i32 10)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = insertelement <2 x double> undef, double %b, i32 0
@@ -31,16 +26,22 @@
   ret double %6 
 }
 
+define double @test_round_sd_1(double %a, double %b) { ; round.sd: extracting lane 1 of the result should fold to a constant
+; CHECK-LABEL: @test_round_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 ; lane 1 of the first operand; this is the value the expected output returns
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 ; lane 1 of the second operand; does not appear in the expected output
+  %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
+  %6 = extractelement <2 x double> %5, i32 1 ; only lane 1 of the result is read
+  ret double %6 
+}
+
 define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_round
-; CHECK-NEXT: %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-; CHECK-NEXT: %4 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
-; CHECK-NEXT: %5 = insertelement <4 x float> %4, float 2.000000e+00, i32 2
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 3.000000e+00, i32 3
-; CHECK-NEXT: %7 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %3, <4 x float> %6, i32 10)
-; CHECK-NEXT: ret <4 x float> %7
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> %b, i32 10)
+; CHECK-NEXT: ret <4 x float> %1
   %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
   %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
   %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
@@ -53,16 +54,9 @@
 
 define float @test_round_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_round_ss_0
-; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> %1, i32 10)
+; CHECK-NEXT: %r = extractelement <4 x float> %2, i32 0
 ; CHECK-NEXT: ret float %r
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -77,5 +71,21 @@
   ret float %r
 }
 
+define float @test_round_ss_2(float %a, float %b) { ; round.ss: extracting lane 2 of the result should fold to a constant
+; CHECK-LABEL: @test_round_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 ; lane 2 of the first operand; this is the value the expected output returns
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 ; lane 2 of the second operand; does not appear in the expected output
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
+  %r = extractelement <4 x float> %9, i32 2 ; only lane 2 of the result is read
+  ret float %r
+}
+
 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
\ No newline at end of file
Index: test/Transforms/InstCombine/x86-xop.ll
===================================================================
--- test/Transforms/InstCombine/x86-xop.ll
+++ test/Transforms/InstCombine/x86-xop.ll
@@ -3,10 +3,9 @@
 define double @test_vfrcz_sd_0(double %a) {
 ; CHECK-LABEL: @test_vfrcz_sd_0
 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
-; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
-; CHECK-NEXT: ret double %4
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %1)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
   %1 = insertelement <2 x double> undef, double %a, i32 0
   %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
   %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
@@ -14,15 +13,22 @@
   ret double %4
 }
 
+define double @test_vfrcz_sd_1(double %a) { ; vfrcz.sd: extracting lane 1 of the result should fold to a constant
+; CHECK-LABEL: @test_vfrcz_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 ; lane 1 of the operand; this is the value the expected output returns
+  %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
+  %4 = extractelement <2 x double> %3, i32 1 ; only lane 1 of the result is read
+  ret double %4
+}
+
 define float @test_vfrcz_ss_0(float %a) {
 ; CHECK-LABEL: @test_vfrcz_ss_0
 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
   %1 = insertelement <4 x float> undef, float %a, i32 0
   %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
   %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -32,6 +38,18 @@
   ret float %6
 }
 
+define float @test_vfrcz_ss_3(float %a) { ; vfrcz.ss: extracting lane 3 of the result should fold to a constant
+; CHECK-LABEL: @test_vfrcz_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 ; lane 3 of the operand; this is the value the expected output returns
+  %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
+  %6 = extractelement <4 x float> %5, i32 3 ; only lane 3 of the result is read
+  ret float %6
+}
+
 define <2 x i64> @cmp_slt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: @cmp_slt_v2i64
 ; CHECK-NEXT: %1 = icmp slt <2 x i64> %a, %b