Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -896,6 +896,12 @@ APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); }; + auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getHighBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; switch (II->getIntrinsicID()) { default: break; @@ -1234,6 +1240,49 @@ break; } + case Intrinsic::x86_sse_add_ss: + case Intrinsic::x86_sse_sub_ss: + case Intrinsic::x86_sse_mul_ss: + case Intrinsic::x86_sse_div_ss: + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_add_sd: + case Intrinsic::x86_sse2_sub_sd: + case Intrinsic::x86_sse2_mul_sd: + case Intrinsic::x86_sse2_div_sd: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + // These intrinsics only demand the lowest element of the second input + // vector. + Value *Arg1 = II->getArgOperand(1); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + II->setArgOperand(1, V); + return II; + } + break; + } + + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // These intrinsics demand the upper elements of the first input vector and + // the lowest element of the second input vector. 
+    Value *Arg0 = II->getArgOperand(0);
+    Value *Arg1 = II->getArgOperand(1);
+    unsigned VWidth = Arg0->getType()->getVectorNumElements();
+    if (Value *V = SimplifyDemandedVectorEltsHigh(Arg0, VWidth, VWidth - 1)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
+    break;
+  }
+
   // Constant fold ashr( <A x Bi>, Ci ).
   // Constant fold lshr( <A x Bi>, Ci ).
   // Constant fold shl( <A x Bi>, Ci ).
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1179,16 +1179,47 @@
     switch (II->getIntrinsicID()) {
     default: break;
 
+    // Unary vector operations that work column-wise.
+    case Intrinsic::x86_sse_rcp_ss:
+    case Intrinsic::x86_sse_rsqrt_ss:
+    case Intrinsic::x86_sse_sqrt_ss:
+    case Intrinsic::x86_sse2_sqrt_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+                                        UndefElts, Depth + 1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (DemandedElts.getLoBits(1) != 1)
+        return II->getArgOperand(0);
+      break;
+
+    // VFRCZSS/SD zero the upper elements of the result instead of passing Arg0
+    // through, so Arg0 must not be substituted for the call. If the lowest
+    // element isn't used, every demanded element is known to be zero.
+    case Intrinsic::x86_xop_vfrcz_ss:
+    case Intrinsic::x86_xop_vfrcz_sd:
+      if (DemandedElts.getLoBits(1) != 1)
+        return Constant::getNullValue(II->getType());
+      break;
+
     // Binary vector operations that work column-wise. A dest element is a
     // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_add_ss:
     case Intrinsic::x86_sse_sub_ss:
     case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_div_ss:
     case Intrinsic::x86_sse_min_ss:
     case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse_cmp_ss:
+    case Intrinsic::x86_sse2_add_sd:
     case Intrinsic::x86_sse2_sub_sd:
     case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_div_sd:
     case Intrinsic::x86_sse2_min_sd:
     case Intrinsic::x86_sse2_max_sd:
+    case Intrinsic::x86_sse2_cmp_sd:
+    case Intrinsic::x86_sse41_round_ss:
+    case Intrinsic::x86_sse41_round_sd:
       TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
                                         UndefElts, Depth + 1);
       if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
@@ -1201,11 +1232,13 @@
       if (DemandedElts == 1) {
         switch (II->getIntrinsicID()) {
         default: break;
+        case Intrinsic::x86_sse_add_ss:
         case Intrinsic::x86_sse_sub_ss:
         case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_add_sd:
         case Intrinsic::x86_sse2_sub_sd:
         case Intrinsic::x86_sse2_mul_sd:
-          // TODO: Lower MIN/MAX/ABS/etc
+          // TODO: Lower MIN/MAX/etc
           Value *LHS = II->getArgOperand(0);
           Value *RHS = II->getArgOperand(1);
           // Extract the element as scalars.
@@ -1216,6 +1249,11 @@
 
           switch (II->getIntrinsicID()) {
           default: llvm_unreachable("Case stmts out of sync!");
+          case Intrinsic::x86_sse_add_ss:
+          case Intrinsic::x86_sse2_add_sd:
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
           case Intrinsic::x86_sse_sub_ss:
           case Intrinsic::x86_sse2_sub_sd:
             TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
@@ -1238,6 +1276,10 @@
         }
       }
 
+      // If lowest element of a scalar op isn't used then use Arg0.
+      if (DemandedElts.getLoBits(1) != 1)
+        return II->getArgOperand(0);
+
       // Output elements are undefined if both are undefined. Consider things
       // like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2; Index: test/Transforms/InstCombine/x86-sse.ll =================================================================== --- test/Transforms/InstCombine/x86-sse.ll +++ test/Transforms/InstCombine/x86-sse.ll @@ -4,12 +4,9 @@ define float @test_rcp_ss_0(float %a) { ; CHECK-LABEL: @test_rcp_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4) -; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0 -; CHECK-NEXT: ret float %6 +; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) +; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0 +; CHECK-NEXT: ret float %3 %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 @@ -19,15 +16,24 @@ ret float %6 } +define float @test_rcp_ss_1(float %a) { +; CHECK-LABEL: @test_rcp_ss_1 +; CHECK-NEXT: ret float 1.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4) + %6 = extractelement <4 x float> %5, i32 1 + ret float %6 +} + define float @test_sqrt_ss_0(float %a) { ; CHECK-LABEL: @test_sqrt_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 
-; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4) -; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0 -; CHECK-NEXT: ret float %6 +; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %1) +; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0 +; CHECK-NEXT: ret float %3 %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 @@ -37,15 +43,24 @@ ret float %6 } +define float @test_sqrt_ss_2(float %a) { +; CHECK-LABEL: @test_sqrt_ss_2 +; CHECK-NEXT: ret float 2.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %4) + %6 = extractelement <4 x float> %5, i32 2 + ret float %6 +} + define float @test_rsqrt_ss_0(float %a) { ; CHECK-LABEL: @test_rsqrt_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4) -; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0 -; CHECK-NEXT: ret float %6 +; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) +; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0 +; CHECK-NEXT: ret float %3 %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 @@ -55,19 +70,33 @@ ret float %6 } +define float @test_rsqrt_ss_3(float %a) { +; 
CHECK-LABEL: @test_rsqrt_ss_3 +; CHECK-NEXT: ret float 3.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4) + %6 = extractelement <4 x float> %5, i32 3 + ret float %6 +} + +define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_add_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + define float @test_add_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_add_ss_0 -; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0 -; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1 -; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 -; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3 -; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %8) -; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0 -; CHECK-NEXT: ret float %r +; CHECK-NEXT: %1 = fadd float %a, %b +; CHECK-NEXT: ret float %1 %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 
1 %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 @@ -81,6 +110,30 @@ ret float %r } +define float @test_add_ss_1(float %a, float %b) { +; CHECK-LABEL: @test_add_ss_1 +; CHECK-NEXT: ret float 1.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> %6, i32 1 + ret float %7 +} + +define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_sub_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + define float @test_sub_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_sub_ss_0 ; CHECK-NEXT: %1 = fsub float %a, %b @@ -98,6 +151,30 @@ ret float %r } +define float @test_sub_ss_2(float %a, float %b) { +; CHECK-LABEL: @test_sub_ss_2 +; CHECK-NEXT: ret float 2.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> %6, i32 2 + ret float %7 +} + +define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: 
@test_mul_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + define float @test_mul_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_mul_ss_0 ; CHECK-NEXT: %1 = fmul float %a, %b @@ -115,18 +192,36 @@ ret float %r } +define float @test_mul_ss_3(float %a, float %b) { +; CHECK-LABEL: @test_mul_ss_3 +; CHECK-NEXT: ret float 3.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> %6, i32 3 + ret float %7 +} + +define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_div_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + define float @test_div_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_div_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 
= insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0 -; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1 -; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 -; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3 -; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %8) -; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0 +; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0 +; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %1, <4 x float> %2) +; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0 ; CHECK-NEXT: ret float %r %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 @@ -141,6 +236,30 @@ ret float %r } +define float @test_div_ss_1(float %a, float %b) { +; CHECK-LABEL: @test_div_ss_1 +; CHECK-NEXT: ret float 1.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> %6, i32 1 + ret float %7 +} + +define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_min_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + 
define float @test_min_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_min_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 @@ -161,6 +280,30 @@ ret float %10 } +define float @test_min_ss_2(float %a, float %b) { +; CHECK-LABEL: @test_min_ss_2 +; CHECK-NEXT: ret float 2.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> %6, i32 2 + ret float %7 +} + +define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_max_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %b) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a, <4 x float> %3) + ret <4 x float> %4 +} + define float @test_max_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_max_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 @@ -181,18 +324,36 @@ ret float %10 } +define float @test_max_ss_3(float %a, float %b) { +; CHECK-LABEL: @test_max_ss_3 +; CHECK-NEXT: ret float 3.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %5) + %7 = extractelement <4 x float> 
%6, i32 3 + ret float %7 +} + +define <4 x float> @test_cmp_ss(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @test_cmp_ss +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %b, i8 0) +; CHECK-NEXT: ret <4 x float> %1 + %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 + %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 + %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 + %4 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a, <4 x float> %3, i8 0) + ret <4 x float> %4 +} + define float @test_cmp_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_cmp_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0 -; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1 -; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 -; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3 -; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %8, i8 0) -; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0 +; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %b, i32 0 +; CHECK-NEXT: %3 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) +; CHECK-NEXT: %r = extractelement <4 x float> %3, i32 0 ; CHECK-NEXT: ret float %r %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 @@ -207,6 +368,19 @@ ret float %r } +define float @test_cmp_ss_1(float %a, float %b) { +; CHECK-LABEL: @test_cmp_ss_1 +; CHECK-NEXT: ret float 1.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 
x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %4, <4 x float> %5, i8 0) + %7 = extractelement <4 x float> %6, i32 1 + ret float %7 +} + define i32 @test_comieq_ss_0(float %a, float %b) { ; CHECK-LABEL: @test_comieq_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 Index: test/Transforms/InstCombine/x86-sse2.ll =================================================================== --- test/Transforms/InstCombine/x86-sse2.ll +++ test/Transforms/InstCombine/x86-sse2.ll @@ -4,10 +4,9 @@ define double @test_sqrt_sd_0(double %a) { ; CHECK-LABEL: @test_sqrt_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) -; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0 -; CHECK-NEXT: ret double %4 +; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %1) +; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0 +; CHECK-NEXT: ret double %3 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -15,15 +14,29 @@ ret double %4 } +define double @test_sqrt_sd_1(double %a) { +; CHECK-LABEL: @test_sqrt_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) + %4 = extractelement <2 x double> %3, i32 1 + ret double %4 +} + +define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: 
@test_add_sd +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_add_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_add_sd_0 -; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0 -; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 -; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4) -; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0 -; CHECK-NEXT: ret double %6 +; CHECK-NEXT: %1 = fadd double %a, %b +; CHECK-NEXT: ret double %1 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = insertelement <2 x double> undef, double %b, i32 0 @@ -33,6 +46,27 @@ ret double %6 } +define double @test_add_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_add_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_sub_sd +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = 
tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_sub_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_sub_sd_0 ; CHECK-NEXT: %1 = fsub double %a, %b @@ -46,6 +80,27 @@ ret double %6 } +define double @test_sub_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_sub_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_mul_sd +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_mul_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_mul_sd_0 ; CHECK-NEXT: %1 = fmul double %a, %b @@ -59,15 +114,34 @@ ret double %6 } +define double @test_mul_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_mul_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_div_sd +; CHECK-NEXT: %1 = tail 
call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_div_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_div_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0 -; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 -; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4) -; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0 -; CHECK-NEXT: ret double %6 +; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0 +; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %1, <2 x double> %2) +; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0 +; CHECK-NEXT: ret double %4 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = insertelement <2 x double> undef, double %b, i32 0 @@ -77,6 +151,27 @@ ret double %6 } +define double @test_div_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_div_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_min_sd +; CHECK-NEXT: %1 = tail call <2 x double> 
@llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_min_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_min_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 @@ -93,6 +188,27 @@ ret double %6 } +define double @test_min_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_min_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_max_sd +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %b) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a, <2 x double> %1) + ret <2 x double> %2 +} + define double @test_max_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_max_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 @@ -109,15 +225,34 @@ ret double %6 } +define double @test_max_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_max_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> 
@llvm.x86.sse2.max.sd(<2 x double> %2, <2 x double> %4) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + +define <2 x double> @test_cmp_sd(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: @test_cmp_sd +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %b, i8 0) +; CHECK-NEXT: ret <2 x double> %1 + %1 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 + %2 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a, <2 x double> %1, i8 0) + ret <2 x double> %2 +} + define double @test_cmp_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_cmp_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0 -; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 -; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0) -; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0 -; CHECK-NEXT: ret double %6 +; CHECK-NEXT: %2 = insertelement <2 x double> undef, double %b, i32 0 +; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) +; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0 +; CHECK-NEXT: ret double %4 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = insertelement <2 x double> undef, double %b, i32 0 @@ -127,6 +262,18 @@ ret double %6 } +define double @test_cmp_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_cmp_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> 
@llvm.x86.sse2.cmp.sd(<2 x double> %2, <2 x double> %4, i8 0) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + define i32 @test_comieq_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_comieq_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 Index: test/Transforms/InstCombine/x86-sse41.ll =================================================================== --- test/Transforms/InstCombine/x86-sse41.ll +++ test/Transforms/InstCombine/x86-sse41.ll @@ -3,10 +3,8 @@ define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_round_sd -; CHECK-NEXT: %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 -; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10) -; CHECK-NEXT: ret <2 x double> %3 +; CHECK-NEXT: %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 10) +; CHECK-NEXT: ret <2 x double> %1 %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0 %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1 %3 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %1, <2 x double> %2, i32 10) @@ -15,13 +13,10 @@ define double @test_round_sd_0(double %a, double %b) { ; CHECK-LABEL: @test_round_sd_0 -; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <2 x double> undef, double %b, i32 0 -; CHECK-NEXT: %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 -; CHECK-NEXT: %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10) -; CHECK-NEXT: %6 = extractelement <2 x double> %5, i32 0 -; CHECK-NEXT: ret double %6 +; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %b, i32 0 +; CHECK-NEXT: %2 = tail call <2 x double> 
@llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> %1, i32 10) +; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0 +; CHECK-NEXT: ret double %3 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = insertelement <2 x double> undef, double %b, i32 0 @@ -31,16 +26,22 @@ ret double %6 } +define double @test_round_sd_1(double %a, double %b) { +; CHECK-LABEL: @test_round_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = insertelement <2 x double> undef, double %b, i32 0 + %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1 + %5 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %2, <2 x double> %4, i32 10) + %6 = extractelement <2 x double> %5, i32 1 + ret double %6 +} + define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_round -; CHECK-NEXT: %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 -; CHECK-NEXT: %4 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 -; CHECK-NEXT: %5 = insertelement <4 x float> %4, float 2.000000e+00, i32 2 -; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 3.000000e+00, i32 3 -; CHECK-NEXT: %7 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %3, <4 x float> %6, i32 10) -; CHECK-NEXT: ret <4 x float> %7 +; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> %b, i32 10) +; CHECK-NEXT: ret <4 x float> %1 %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3 @@ -53,16 +54,9 @@ define float @test_round_ss_0(float %a, float %b) { ; 
CHECK-LABEL: @test_round_ss_0 -; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = insertelement <4 x float> undef, float %b, i32 0 -; CHECK-NEXT: %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1 -; CHECK-NEXT: %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 -; CHECK-NEXT: %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3 -; CHECK-NEXT: %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10) -; CHECK-NEXT: %r = extractelement <4 x float> %9, i32 0 +; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %b, i32 0 +; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> %1, i32 10) +; CHECK-NEXT: %r = extractelement <4 x float> %2, i32 0 ; CHECK-NEXT: ret float %r %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 @@ -77,5 +71,21 @@ ret float %r } +define float @test_round_ss_2(float %a, float %b) { +; CHECK-LABEL: @test_round_ss_2 +; CHECK-NEXT: ret float 2.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = insertelement <4 x float> undef, float %b, i32 0 + %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1 + %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2 + %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3 + %9 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %4, <4 x float> %8, i32 10) + %r = extractelement <4 x float> %9, i32 2 + ret float %r +} + declare <2 x double> 
@llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone \ No newline at end of file Index: test/Transforms/InstCombine/x86-xop.ll =================================================================== --- test/Transforms/InstCombine/x86-xop.ll +++ test/Transforms/InstCombine/x86-xop.ll @@ -3,10 +3,9 @@ define double @test_vfrcz_sd_0(double %a) { ; CHECK-LABEL: @test_vfrcz_sd_0 ; CHECK-NEXT: %1 = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT: %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2) -; CHECK-NEXT: %4 = extractelement <2 x double> %3, i32 0 -; CHECK-NEXT: ret double %4 +; CHECK-NEXT: %2 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %1) +; CHECK-NEXT: %3 = extractelement <2 x double> %2, i32 0 +; CHECK-NEXT: ret double %3 %1 = insertelement <2 x double> undef, double %a, i32 0 %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2) @@ -14,15 +13,22 @@ ret double %4 } +define double @test_vfrcz_sd_1(double %a) { +; CHECK-LABEL: @test_vfrcz_sd_1 +; CHECK-NEXT: ret double 1.000000e+00 + %1 = insertelement <2 x double> undef, double %a, i32 0 + %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1 + %3 = tail call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %2) + %4 = extractelement <2 x double> %3, i32 1 + ret double %4 +} + define float @test_vfrcz_ss_0(float %a) { ; CHECK-LABEL: @test_vfrcz_ss_0 ; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT: %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 -; CHECK-NEXT: %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 -; CHECK-NEXT: %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 -; CHECK-NEXT: %5 = tail call <4 x float> 
@llvm.x86.xop.vfrcz.ss(<4 x float> %4) -; CHECK-NEXT: %6 = extractelement <4 x float> %5, i32 0 -; CHECK-NEXT: ret float %6 +; CHECK-NEXT: %2 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %1) +; CHECK-NEXT: %3 = extractelement <4 x float> %2, i32 0 +; CHECK-NEXT: ret float %3 %1 = insertelement <4 x float> undef, float %a, i32 0 %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 @@ -32,6 +38,18 @@ ret float %6 } +define float @test_vfrcz_ss_3(float %a) { +; CHECK-LABEL: @test_vfrcz_ss_3 +; CHECK-NEXT: ret float 3.000000e+00 + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 + %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2 + %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3 + %5 = tail call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %4) + %6 = extractelement <4 x float> %5, i32 3 + ret float %6 +} + define <2 x i64> @cmp_slt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: @cmp_slt_v2i64 ; CHECK-NEXT: %1 = icmp slt <2 x i64> %a, %b