Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -896,6 +896,12 @@
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
};
+ auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getHighBitsSet(Width, DemandedWidth);
+ return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
switch (II->getIntrinsicID()) {
default: break;
@@ -1234,6 +1240,49 @@
break;
}
+ case Intrinsic::x86_sse_add_ss:
+ case Intrinsic::x86_sse_sub_ss:
+ case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_div_ss:
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_add_sd:
+ case Intrinsic::x86_sse2_sub_sd:
+ case Intrinsic::x86_sse2_mul_sd:
+ case Intrinsic::x86_sse2_div_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ // These intrinsics only demand the lowest element of the second input
+ // vector.
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg1->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ return II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // These intrinsics demand the upper elements of the first input vector and
+ // the lowest element of the second input vector.
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsHigh(Arg0, VWidth, VWidth - 1)) {
+ II->setArgOperand(0, V);
+ return II;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ II->setArgOperand(1, V);
+ return II;
+ }
+ break;
+ }
+
// Constant fold ashr( , Ci ).
// Constant fold lshr( , Ci ).
// Constant fold shl( , Ci ).
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1179,16 +1179,40 @@
switch (II->getIntrinsicID()) {
default: break;
+ // Unary vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ case Intrinsic::x86_sse_sqrt_ss:
+ case Intrinsic::x86_sse2_sqrt_sd:
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+ UndefElts, Depth + 1);
+ if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (DemandedElts.getLoBits(1) != 1)
+ return II->getArgOperand(0);
+ break;
+
// Binary vector operations that work column-wise. A dest element is a
// function of the corresponding input elements from the two inputs.
+ case Intrinsic::x86_sse_add_ss:
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse_div_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_add_sd:
case Intrinsic::x86_sse2_sub_sd:
case Intrinsic::x86_sse2_mul_sd:
+ case Intrinsic::x86_sse2_div_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1201,11 +1225,13 @@
if (DemandedElts == 1) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::x86_sse_add_ss:
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse_mul_ss:
+ case Intrinsic::x86_sse2_add_sd:
case Intrinsic::x86_sse2_sub_sd:
case Intrinsic::x86_sse2_mul_sd:
- // TODO: Lower MIN/MAX/ABS/etc
+ // TODO: Lower MIN/MAX/etc
Value *LHS = II->getArgOperand(0);
Value *RHS = II->getArgOperand(1);
// Extract the element as scalars.
@@ -1216,6 +1242,11 @@
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_sse_add_ss:
+ case Intrinsic::x86_sse2_add_sd:
+ TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS,
+ II->getName()), *II);
+ break;
case Intrinsic::x86_sse_sub_ss:
case Intrinsic::x86_sse2_sub_sd:
TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
@@ -1238,6 +1269,10 @@
}
}
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (DemandedElts.getLoBits(1) != 1)
+ return II->getArgOperand(0);
+
// Output elements are undefined if both are undefined. Consider things
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
Index: test/Transforms/InstCombine/x86-sse.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse.ll
+++ test/Transforms/InstCombine/x86-sse.ll
@@ -4,12 +4,9 @@
define float @test_rcp_ss_0(float %a) {
; CHECK-LABEL: @test_rcp_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
%3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -19,15 +16,24 @@
ret float %6
}
+define float @test_rcp_ss_1(float %a) {
+; CHECK-LABEL: @test_rcp_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4)
+ %6 = extractelement <4 x float> %5, i32 1
+ ret float %6
+}
+
define float @test_sqrt_ss_0(float %a) {
; CHECK-LABEL: @test_sqrt_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
%3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -37,15 +43,24 @@
ret float %6
}
+define float @test_sqrt_ss_2(float %a) {
+; CHECK-LABEL: @test_sqrt_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4)
+ %6 = extractelement <4 x float> %5, i32 2
+ ret float %6
+}
+
define float @test_rsqrt_ss_0(float %a) {
; CHECK-LABEL: @test_rsqrt_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
%3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -55,19 +70,33 @@
ret float %6
}
+define float @test_rsqrt_ss_3(float %a) {
+; CHECK-LABEL: @test_rsqrt_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4)
+ %6 = extractelement <4 x float> %5, i32 3
+ ret float %6
+}
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_add_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_add_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_add_ss_0
-; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %8)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
-; CHECK-NEXT: ret float %r
+; CHECK-NEXT: %1 = fadd float %a, %b
+; CHECK-NEXT: ret float %1
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
%3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -81,6 +110,30 @@
ret float %r
}
+define float @test_add_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_add_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 1
+ ret float %7
+}
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_sub_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_sub_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_sub_ss_0
; CHECK-NEXT: %1 = fsub float %a, %b
@@ -98,6 +151,30 @@
ret float %r
}
+define float @test_sub_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_sub_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 2
+ ret float %7
+}
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_mul_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_mul_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_mul_ss_0
; CHECK-NEXT: %1 = fmul float %a, %b
@@ -115,18 +192,36 @@
ret float %r
}
+define float @test_mul_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_mul_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 3
+ ret float %7
+}
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_div_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_div_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_div_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %8)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %1, <4 x float> %2)
+; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0
; CHECK-NEXT: ret float %r
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -141,6 +236,30 @@
ret float %r
}
+define float @test_div_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_div_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 1
+ ret float %7
+}
+
+define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_min_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_min_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_min_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
@@ -161,6 +280,30 @@
ret float %10
}
+define float @test_min_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_min_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 2
+ ret float %7
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_max_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %b)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %3)
+ ret <4 x float> %4
+}
+
define float @test_max_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_max_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
@@ -181,18 +324,36 @@
ret float %10
}
+define float @test_max_ss_3(float %a, float %b) {
+; CHECK-LABEL: @test_max_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %5)
+ %7 = extractelement <4 x float> %6, i32 3
+ ret float %7
+}
+
+define <4 x float> @test_cmp_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_cmp_ss
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %b, i8 0)
+; CHECK-NEXT: ret <4 x float> %1
+ %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+ %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+ %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+ %4 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %3, i8 0)
+ ret <4 x float> %4
+}
+
define float @test_cmp_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_cmp_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %8, i8 0)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0)
+; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0
; CHECK-NEXT: ret float %r
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -207,6 +368,19 @@
ret float %r
}
+define float @test_cmp_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_cmp_ss_1
+; CHECK-NEXT: ret float 1.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %5, i8 0)
+ %7 = extractelement <4 x float> %6, i32 1
+ ret float %7
+}
+
define i32 @test_comieq_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_comieq_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
Index: test/Transforms/InstCombine/x86-sse2.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse2.ll
+++ test/Transforms/InstCombine/x86-sse2.ll
@@ -4,10 +4,9 @@
define double @test_sqrt_sd_0(double %a) {
; CHECK-LABEL: @test_sqrt_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
-; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
-; CHECK-NEXT: ret double %4
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %1)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
@@ -15,15 +14,29 @@
ret double %4
}
+define double @test_sqrt_sd_1(double %a) {
+; CHECK-LABEL: @test_sqrt_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
+ %4 = extractelement <2 x double> %3, i32 1
+ ret double %4
+}
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_add_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_add_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_add_sd_0
-; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %1 = fadd double %a, %b
+; CHECK-NEXT: ret double %1
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = insertelement <2 x double> undef, double %b, i32 0
@@ -33,6 +46,27 @@
ret double %6
}
+define double @test_add_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_add_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_sub_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_sub_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_sub_sd_0
; CHECK-NEXT: %1 = fsub double %a, %b
@@ -46,6 +80,27 @@
ret double %6
}
+define double @test_sub_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_sub_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_mul_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_mul_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_mul_sd_0
; CHECK-NEXT: %1 = fmul double %a, %b
@@ -59,15 +114,34 @@
ret double %6
}
+define double @test_mul_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_mul_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_div_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_div_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_div_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %1, <2 x double> %2)
+; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
+; CHECK-NEXT: ret double %4
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = insertelement <2 x double> undef, double %b, i32 0
@@ -77,6 +151,27 @@
ret double %6
}
+define double @test_div_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_div_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_min_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_min_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_min_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
@@ -93,6 +188,27 @@
ret double %6
}
+define double @test_min_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_min_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_max_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %b)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %1)
+ ret <2 x double> %2
+}
+
define double @test_max_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_max_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
@@ -109,15 +225,34 @@
ret double %6
}
+define double @test_max_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_max_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %2, <2 x double> %4)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
+define <2 x double> @test_cmp_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_cmp_sd
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %b, i8 0)
+; CHECK-NEXT: ret <2 x double> %1
+ %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+ %2 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %1, i8 0)
+ ret <2 x double> %2
+}
+
define double @test_cmp_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_cmp_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0)
+; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
+; CHECK-NEXT: ret double %4
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = insertelement <2 x double> undef, double %b, i32 0
@@ -127,6 +262,18 @@
ret double %6
}
+define double @test_cmp_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_cmp_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
define i32 @test_comieq_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_comieq_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
Index: test/Transforms/InstCombine/x86-sse41.ll
===================================================================
--- test/Transforms/InstCombine/x86-sse41.ll
+++ test/Transforms/InstCombine/x86-sse41.ll
@@ -3,10 +3,8 @@
define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @test_round_sd
-; CHECK-NEXT: %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10)
-; CHECK-NEXT: ret <2 x double> %3
+; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 10)
+; CHECK-NEXT: ret <2 x double> %1
%1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
%2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
%3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10)
@@ -15,13 +13,10 @@
define double @test_round_sd_0(double %a, double %b) {
; CHECK-LABEL: @test_round_sd_0
-; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0
-; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
-; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
-; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0
-; CHECK-NEXT: ret double %6
+; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> %1, i32 10)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = insertelement <2 x double> undef, double %b, i32 0
@@ -31,16 +26,22 @@
ret double %6
}
+define double @test_round_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_round_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = insertelement <2 x double> undef, double %b, i32 0
+ %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+ %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10)
+ %6 = extractelement <2 x double> %5, i32 1
+ ret double %6
+}
+
define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @test_round
-; CHECK-NEXT: %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-; CHECK-NEXT: %4 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
-; CHECK-NEXT: %5 = insertelement <4 x float> %4, float 2.000000e+00, i32 2
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 3.000000e+00, i32 3
-; CHECK-NEXT: %7 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %3, <4 x float> %6, i32 10)
-; CHECK-NEXT: ret <4 x float> %7
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> , <4 x float> %b, i32 10)
+; CHECK-NEXT: ret <4 x float> %1
%1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
%2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
%3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
@@ -53,16 +54,9 @@
define float @test_round_ss_0(float %a, float %b) {
; CHECK-LABEL: @test_round_ss_0
-; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0
-; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
-; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
-; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
-; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
-; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0
+; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> %1, i32 10)
+; CHECK-NEXT: %r = extractelement <4 x float> %2, i32 0
; CHECK-NEXT: ret float %r
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
@@ -77,5 +71,21 @@
ret float %r
}
+define float @test_round_ss_2(float %a, float %b) {
+; CHECK-LABEL: @test_round_ss_2
+; CHECK-NEXT: ret float 2.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = insertelement <4 x float> undef, float %b, i32 0
+ %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+ %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+ %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+ %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10)
+ %r = extractelement <4 x float> %9, i32 2
+ ret float %r
+}
+
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
\ No newline at end of file
Index: test/Transforms/InstCombine/x86-xop.ll
===================================================================
--- test/Transforms/InstCombine/x86-xop.ll
+++ test/Transforms/InstCombine/x86-xop.ll
@@ -3,10 +3,9 @@
define double @test_vfrcz_sd_0(double %a) {
; CHECK-LABEL: @test_vfrcz_sd_0
; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0
-; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
-; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0
-; CHECK-NEXT: ret double %4
+; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %1)
+; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0
+; CHECK-NEXT: ret double %3
%1 = insertelement <2 x double> undef, double %a, i32 0
%2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
%3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
@@ -14,15 +13,22 @@
ret double %4
}
+define double @test_vfrcz_sd_1(double %a) {
+; CHECK-LABEL: @test_vfrcz_sd_1
+; CHECK-NEXT: ret double 1.000000e+00
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+ %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2)
+ %4 = extractelement <2 x double> %3, i32 1
+ ret double %4
+}
+
define float @test_vfrcz_ss_0(float %a) {
; CHECK-LABEL: @test_vfrcz_ss_0
; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0
-; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
-; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
-; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
-; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0
-; CHECK-NEXT: ret float %6
+; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %1)
+; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-NEXT: ret float %3
%1 = insertelement <4 x float> undef, float %a, i32 0
%2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
%3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
@@ -32,6 +38,18 @@
ret float %6
}
+define float @test_vfrcz_ss_3(float %a) {
+; CHECK-LABEL: @test_vfrcz_ss_3
+; CHECK-NEXT: ret float 3.000000e+00
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+ %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4)
+ %6 = extractelement <4 x float> %5, i32 3
+ ret float %6
+}
+
define <2 x i64> @cmp_slt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: @cmp_slt_v2i64
; CHECK-NEXT: %1 = icmp slt <2 x i64> %a, %b