Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30253,9 +30253,9 @@
   //                  Y                  Y
   //             Num   xNaN          +0     -0
   //            ---------------   ---------------
-  // Num     |  Max  |  qNaN |  +0 |  +0  |  +0  |
+  // Num     |  Max  |   Y   |  +0 |  +0  |  +0  |
   // X        ---------------  X ---------------
-  // xNaN    |  qNaN |  qNaN |  -0 |  +0  |  -0  |
+  // xNaN    |   X   |  X/Y  |  -0 |  +0  |  -0  |
   //            ---------------   ---------------
   //
   // It is achieved by means of FMAX/FMIN with preliminary checks and operand
@@ -30273,15 +30273,18 @@
     return false;
   };
 
-  SDValue MinMax;
   bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
   bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
-  if (DAG.getTarget().Options.NoSignedZerosFPMath ||
-      Op->getFlags().hasNoSignedZeros() || IsPreferredZero(Y) ||
-      DAG.isKnownNeverZeroFloat(X)) {
-    MinMax = DAG.getNode(MinMaxOp, DL, VT, X, Y, Op->getFlags());
+  bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
+                          Op->getFlags().hasNoSignedZeros();
+  SDValue NewX, NewY;
+  if (IgnoreSignedZero || IsPreferredZero(Y) || DAG.isKnownNeverZeroFloat(X)) {
+    // Operands are already in the right order, or the order does not matter.
+    NewX = X;
+    NewY = Y;
   } else if (IsPreferredZero(X) || DAG.isKnownNeverZeroFloat(Y)) {
-    MinMax = DAG.getNode(MinMaxOp, DL, VT, Y, X, Op->getFlags());
+    NewX = Y;
+    NewY = X;
   } else if ((VT == MVT::f16 || Subtarget.hasDQI()) &&
              (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
     if (IsXNeverNaN)
@@ -30300,8 +30303,8 @@
                       DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
                       DAG.getIntPtrConstant(0, DL));
     SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
-    SDValue NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
-    SDValue NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
+    NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
+    NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
     return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
   } else {
     SDValue IsXZero;
@@ -30330,19 +30333,26 @@
       IsXZero = DAG.getSetCC(DL, SetCCType, IsXZero,
                              DAG.getConstant(0, DL, MVT::i32), ISD::SETEQ);
     }
-    SDValue NewX = DAG.getSelect(DL, VT, IsXZero, Y, X);
-    SDValue NewY = DAG.getSelect(DL, VT, IsXZero, X, Y);
-    MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
+    NewX = DAG.getSelect(DL, VT, IsXZero, Y, X);
+    NewY = DAG.getSelect(DL, VT, IsXZero, X, Y);
   }
 
-  if (Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN))
+  bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
+                   Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
+
+  // If we did not reorder the operands for signed-zero handling, we still
+  // need to handle NaN, and the second operand is known not to be NaN, then
+  // put it first so that no NaN post-processing is needed after the max/min.
+  if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
+    std::swap(NewX, NewY);
+
+  SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
+
+  if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
     return MinMax;
 
-  APFloat NaNValue = APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT));
-  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, IsXNeverNaN ? Y : X,
-                               IsYNeverNaN ? X : Y, ISD::SETUO);
-  return DAG.getSelect(DL, VT, IsNaN, DAG.getConstantFP(NaNValue, DL, VT),
-                       MinMax);
+  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
+  return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
 }
 
 static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
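[Reviewer note, not part of the patch] The rewrite leans on two facts that are easy to lose in the truth table above: MAXSS/MINSS return their second source operand whenever the comparison is false (NaN operands, or the +0/-0 tie), while llvm.maximum wants NaN to propagate and +0 to beat -0. The standalone C++ sketch below (illustrative names only; it assumes the operands were already reordered the way the code above arranges) shows why one reordering plus a single unordered self-compare of NewX recovers the reference semantics:

#include <cassert>
#include <cmath>

// Scalar model of the MAXSS instruction: the second operand wins whenever
// `a > b` is false, i.e. for NaN inputs and for the +0 vs -0 tie.
static float x86_maxss(float a, float b) { return a > b ? a : b; }

// Reference semantics of llvm.maximum: NaN wins, and +0 beats -0.
static float ref_maximum(float x, float y) {
  if (std::isnan(x)) return x;
  if (std::isnan(y)) return y;
  if (x == y) return std::signbit(y) ? x : y; // prefer +0 over -0
  return x > y ? x : y;
}

// The lowering recipe, assuming the operands were reordered so a possible
// -0 sits in NewX (equivalently, the preferred +0 sits in NewY).
static float lowered_maximum(float new_x, float new_y) {
  float min_max = x86_maxss(new_x, new_y);  // the FMAX node
  return new_x != new_x ? new_x : min_max;  // cmpunord(NewX, NewX) + select
}

int main() {
  assert(lowered_maximum(1.0f, 2.0f) == ref_maximum(1.0f, 2.0f));
  assert(std::isnan(lowered_maximum(NAN, 1.0f)));      // table row xNaN: X
  assert(std::isnan(lowered_maximum(1.0f, NAN)));      // table row Num/xNaN: Y
  assert(!std::signbit(lowered_maximum(-0.0f, 0.0f))); // -0 first => +0 result
  return 0;
}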
Index: llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
+++ llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
@@ -14,13 +14,13 @@
 ; CHECK-NEXT:    cmpl $32768, %eax # imm = 0x8000
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm0, %k2
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vminsh %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT:    vminsh %xmm1, %xmm2, %xmm0
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k2}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %z = call half @llvm.minimum.f16(half %x, half %y)
   ret half %z
@@ -79,10 +79,9 @@
 define half @test_fminimum_zero(half %x, half %y) {
 ; CHECK-LABEL: test_fminimum_zero:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; CHECK-NEXT:    vmovsh %xmm2, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %1 = tail call half @llvm.minimum.f16(half -0.0, half %y)
   ret half %1
@@ -91,10 +90,10 @@
 define half @test_fminimum_nsz(half %x, half %y) {
 ; CHECK-LABEL: test_fminimum_nsz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vminsh %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vminsh %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call nsz half @llvm.minimum.f16(half %x, half %y)
   ret half %1
@@ -122,13 +121,13 @@
 ; CHECK-NEXT:    testw %ax, %ax
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm0, %k2
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmaxsh %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT:    vmaxsh %xmm1, %xmm2, %xmm0
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k2}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %r = call half @llvm.maximum.f16(half %x, half %y)
   ret half %r
@@ -193,9 +192,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovsh %xmm2, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %1 = tail call half @llvm.maximum.f16(half 0.0, half %y)
   ret half %1
@@ -204,10 +202,10 @@
 define half @test_fmaximum_nsz(half %x, half %y) "no-signed-zeros-fp-math"="true" {
 ; CHECK-LABEL: test_fmaximum_nsz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpunordsh %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmaxsh %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmaxsh %xmm1, %xmm0, %xmm1
+; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call half @llvm.maximum.f16(half %x, half %y)
   ret half %1
Index: llvm/test/CodeGen/X86/extract-fp.ll
===================================================================
--- llvm/test/CodeGen/X86/extract-fp.ll
+++ llvm/test/CodeGen/X86/extract-fp.ll
@@ -111,11 +111,7 @@
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    maxsd %xmm0, %xmm1
-; CHECK-NEXT:    cmpunordsd %xmm0, %xmm0
-; CHECK-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT:    andpd %xmm0, %xmm2
-; CHECK-NEXT:    andnpd %xmm1, %xmm0
-; CHECK-NEXT:    orpd %xmm2, %xmm0
+; CHECK-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %v = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> )
   %r = extractelement <2 x double> %v, i32 1
@@ -128,11 +124,7 @@
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    minss %xmm0, %xmm1
-; CHECK-NEXT:    cmpunordss %xmm0, %xmm0
-; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    andps %xmm0, %xmm2
-; CHECK-NEXT:    andnps %xmm1, %xmm0
-; CHECK-NEXT:    orps %xmm2, %xmm0
+; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> )
   %r = extractelement <4 x float> %v, i32 1
Index: llvm/test/CodeGen/X86/extractelement-fp.ll
===================================================================
--- llvm/test/CodeGen/X86/extractelement-fp.ll
+++ llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -680,16 +680,14 @@
 ; X64-NEXT:    je .LBB30_1
 ; X64-NEXT:  # %bb.2:
 ; X64-NEXT:    vmovdqa %xmm1, %xmm2
-; X64-NEXT:    vmovdqa %xmm0, %xmm3
 ; X64-NEXT:    jmp .LBB30_3
 ; X64-NEXT:  .LBB30_1:
 ; X64-NEXT:    vmovdqa %xmm0, %xmm2
-; X64-NEXT:    vmovdqa %xmm1, %xmm3
+; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:  .LBB30_3:
-; X64-NEXT:    vmaxss %xmm2, %xmm3, %xmm2
-; X64-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; X64-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: fmaximum_v4f32:
@@ -699,17 +697,15 @@
 ; X86-NEXT:    je .LBB30_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
 ; X86-NEXT:    jmp .LBB30_3
 ; X86-NEXT:  .LBB30_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB30_3:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vmaxss %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -727,15 +723,14 @@
 ; X64-NEXT:    je .LBB31_1
 ; X64-NEXT:  # %bb.2:
 ; X64-NEXT:    vmovdqa %xmm1, %xmm2
-; X64-NEXT:    vmovdqa %xmm0, %xmm3
 ; X64-NEXT:    jmp .LBB31_3
 ; X64-NEXT:  .LBB31_1:
 ; X64-NEXT:    vmovdqa %xmm0, %xmm2
-; X64-NEXT:    vmovdqa %xmm1, %xmm3
+; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:  .LBB31_3:
-; X64-NEXT:    vmaxsd %xmm2, %xmm3, %xmm2
-; X64-NEXT:    vcmpunordsd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; X64-NEXT:    vmaxsd %xmm2, %xmm0, %xmm1
+; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -747,19 +742,18 @@
 ; X86-NEXT:    je .LBB31_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
 ; X86-NEXT:    jmp .LBB31_3
 ; X86-NEXT:  .LBB31_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB31_3:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmaxsd %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordsd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
+; X86-NEXT:    vmaxsd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -779,16 +773,14 @@
 ; X64-NEXT:    je .LBB32_1
 ; X64-NEXT:  # %bb.2:
 ; X64-NEXT:    vmovdqa %xmm1, %xmm2
-; X64-NEXT:    vmovdqa %xmm0, %xmm3
 ; X64-NEXT:    jmp .LBB32_3
 ; X64-NEXT:  .LBB32_1:
 ; X64-NEXT:    vmovdqa %xmm0, %xmm2
-; X64-NEXT:    vmovdqa %xmm1, %xmm3
+; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:  .LBB32_3:
-; X64-NEXT:    vminss %xmm2, %xmm3, %xmm2
-; X64-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; X64-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: fminimum_v4f32:
@@ -798,17 +790,15 @@
 ; X86-NEXT:    je .LBB32_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
 ; X86-NEXT:    jmp .LBB32_3
 ; X86-NEXT:  .LBB32_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB32_3:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vminss %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -827,15 +817,14 @@
 ; X64-NEXT:    je .LBB33_1
 ; X64-NEXT:  # %bb.2:
 ; X64-NEXT:    vmovdqa %xmm1, %xmm2
-; X64-NEXT:    vmovdqa %xmm0, %xmm3
 ; X64-NEXT:    jmp .LBB33_3
 ; X64-NEXT:  .LBB33_1:
 ; X64-NEXT:    vmovdqa %xmm0, %xmm2
-; X64-NEXT:    vmovdqa %xmm1, %xmm3
+; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:  .LBB33_3:
-; X64-NEXT:    vminsd %xmm2, %xmm3, %xmm2
-; X64-NEXT:    vcmpunordsd %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; X64-NEXT:    vminsd %xmm2, %xmm0, %xmm1
+; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -848,19 +837,18 @@
 ; X86-NEXT:    je .LBB33_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
 ; X86-NEXT:    jmp .LBB33_3
 ; X86-NEXT:  .LBB33_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB33_3:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vminsd %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordsd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
+; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
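[Reviewer note, not part of the patch] All of the updated CHECK sequences above share one shape: the min/max, a vcmpunordss/vcmpunordsd of NewX against itself, and a blend that re-selects NewX on the NaN lane, instead of the old blend with a qNaN constant loaded from the constant pool. At intrinsic level the pattern is as follows (a sketch with assumed names; _mm_blendv_ps needs SSE4.1):

#include <smmintrin.h> // SSE4.1 for _mm_blendv_ps

// Mirrors the instruction sequence in the updated checks: max, then a
// self-compare of NewX (true only when NewX is NaN), then a blend that
// keeps NewX on the NaN lane rather than a qNaN constant.
static __m128 fmaximum_lowered(__m128 new_x, __m128 new_y) {
  __m128 min_max = _mm_max_ss(new_x, new_y);      // vmaxss
  __m128 is_nan  = _mm_cmpunord_ss(new_x, new_x); // vcmpunordss
  return _mm_blendv_ps(min_max, new_x, is_nan);   // vblendvps
}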
Index: llvm/test/CodeGen/X86/fminimum-fmaximum.ll
===================================================================
--- llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -21,37 +21,40 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    je .LBB0_2
 ; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:  .LBB0_2:
-; SSE2-NEXT:    maxss %xmm3, %xmm2
-; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    andnps %xmm2, %xmm3
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm0, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    je .LBB0_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:  .LBB0_4:
+; SSE2-NEXT:    maxss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximum:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa %xmm1, %xmm3
-; AVX1-NEXT:    je .LBB0_2
-; AVX1-NEXT:  # %bb.1:
+; AVX1-NEXT:    je .LBB0_1
+; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
-; AVX1-NEXT:    vmovdqa %xmm0, %xmm3
-; AVX1-NEXT:  .LBB0_2:
-; AVX1-NEXT:    vmaxss %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX1-NEXT:    jmp .LBB0_3
+; AVX1-NEXT:  .LBB0_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:  .LBB0_3:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fmaximum:
@@ -60,31 +63,33 @@
 ; AVX512-NEXT:    testl %eax, %eax
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT:    vcmpunordss %xmm1, %xmm0, %k2
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fmaximum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
-; X86-NEXT:    je .LBB0_2
-; X86-NEXT:  # %bb.1:
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
-; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    vmaxss %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -284,8 +289,8 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    cmpunordsd %xmm1, %xmm0
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
 ; SSE2-NEXT:    xorpd %xmm3, %xmm3
 ; SSE2-NEXT:    maxsd %xmm3, %xmm1
 ; SSE2-NEXT:    andnpd %xmm1, %xmm0
@@ -296,8 +301,8 @@
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fmaximum_zero0:
@@ -305,7 +310,7 @@
 ; AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fmaximum_zero0:
@@ -317,8 +322,8 @@
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -333,8 +338,8 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    cmpunordsd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
 ; SSE2-NEXT:    xorpd %xmm3, %xmm3
 ; SSE2-NEXT:    maxsd %xmm3, %xmm0
 ; SSE2-NEXT:    andnpd %xmm0, %xmm1
@@ -346,8 +351,8 @@
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fmaximum_zero1:
@@ -355,7 +360,7 @@
 ; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -368,8 +373,8 @@
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -402,37 +407,37 @@
 ; SSE2-LABEL: test_fmaximum_nsz:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    maxss %xmm1, %xmm2
-; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    andnps %xmm2, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    maxss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximum_nsz:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fmaximum_nsz:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcmpunordss %xmm1, %xmm0, %k1
-; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
+; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fmaximum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vcmpunordss %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -447,40 +452,41 @@
 ; SSE2-NEXT:    divss %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    je .LBB9_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    jmp .LBB9_3
-; SSE2-NEXT:  .LBB9_1:
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:  .LBB9_3:
-; SSE2-NEXT:    maxss %xmm2, %xmm1
-; SSE2-NEXT:    cmpunordss %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    andnps %xmm1, %xmm2
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm0, %xmm1
-; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    je .LBB9_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:  .LBB9_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    je .LBB9_4
+; SSE2-NEXT:  # %bb.3:
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  .LBB9_4:
+; SSE2-NEXT:    maxss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximum_combine_cmps:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
 ; AVX1-NEXT:    je .LBB9_1
 ; AVX1-NEXT:  # %bb.2:
-; AVX1-NEXT:    vmovaps %xmm1, %xmm2
-; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm1
 ; AVX1-NEXT:    jmp .LBB9_3
 ; AVX1-NEXT:  .LBB9_1:
-; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB9_3:
-; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512F-LABEL: test_fmaximum_combine_cmps:
@@ -492,10 +498,11 @@
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
 ; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k2
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512F-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_fmaximum_combine_cmps:
@@ -514,20 +521,20 @@
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    je .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    vmovaps %xmm1, %xmm2
-; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm1
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
-; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm0
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    vmaxss %xmm2, %xmm1, %xmm1
-; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -546,37 +553,40 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    je .LBB10_2
 ; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:  .LBB10_2:
-; SSE2-NEXT:    minss %xmm3, %xmm2
-; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    andnps %xmm2, %xmm3
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm0, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    je .LBB10_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:  .LBB10_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimum:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
-; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa %xmm1, %xmm3
-; AVX1-NEXT:    je .LBB10_2
-; AVX1-NEXT:  # %bb.1:
+; AVX1-NEXT:    je .LBB10_1
+; AVX1-NEXT:  # %bb.2:
 ; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
-; AVX1-NEXT:    vmovdqa %xmm0, %xmm3
-; AVX1-NEXT:  .LBB10_2:
-; AVX1-NEXT:    vminss %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX1-NEXT:    jmp .LBB10_3
+; AVX1-NEXT:  .LBB10_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:  .LBB10_3:
+; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fminimum:
@@ -585,31 +595,33 @@
 ; AVX512-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
-; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT:    vcmpunordss %xmm1, %xmm0, %k2
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT:    vminss %xmm1, %xmm2, %xmm0
-; AVX512-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fminimum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    je .LBB10_1
+; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    vmovdqa %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm0, %xmm3
-; X86-NEXT:    je .LBB10_2
-; X86-NEXT:  # %bb.1:
+; X86-NEXT:    jmp .LBB10_3
+; X86-NEXT:  .LBB10_1:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqa %xmm1, %xmm3
-; X86-NEXT:  .LBB10_2:
-; X86-NEXT:    vminss %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vcmpunordss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB10_3:
+; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -778,8 +790,8 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    cmpunordsd %xmm1, %xmm0
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
 ; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT:    andnpd %xmm1, %xmm0
 ; SSE2-NEXT:    orpd %xmm2, %xmm0
@@ -788,15 +800,15 @@
 ; AVX1-LABEL: test_fminimum_zero0:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm0
-; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fminimum_zero0:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
 ; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fminimum_zero0:
@@ -807,8 +819,8 @@
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
-; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -823,8 +835,8 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    cmpunordsd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
 ; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    andnpd %xmm0, %xmm1
 ; SSE2-NEXT:    orpd %xmm2, %xmm1
@@ -834,15 +846,16 @@
 ; AVX1-LABEL: test_fminimum_zero1:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fminimum_zero1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
+; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fminimum_zero1:
@@ -853,8 +866,8 @@
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
-; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -888,37 +901,37 @@
 ; SSE2-LABEL: test_fminimum_nsz:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    minss %xmm1, %xmm2
-; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    andnps %xmm2, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    minss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimum_nsz:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fminimum_nsz:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcmpunordss %xmm1, %xmm0, %k1
-; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
+; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: test_fminimum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vcmpunordss %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vminss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -933,40 +946,41 @@
 ; SSE2-NEXT:    divss %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
-; SSE2-NEXT:    je .LBB19_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    movaps %xmm1, %xmm2
-; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    jmp .LBB19_3
-; SSE2-NEXT:  .LBB19_1:
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:  .LBB19_3:
-; SSE2-NEXT:    minss %xmm2, %xmm1
-; SSE2-NEXT:    cmpunordss %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    andnps %xmm1, %xmm2
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    andps %xmm0, %xmm1
-; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    je .LBB19_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:  .LBB19_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    je .LBB19_4
+; SSE2-NEXT:  # %bb.3:
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  .LBB19_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimum_combine_cmps:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
 ; AVX1-NEXT:    je .LBB19_1
 ; AVX1-NEXT:  # %bb.2:
-; AVX1-NEXT:    vmovaps %xmm1, %xmm2
-; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm1
 ; AVX1-NEXT:    jmp .LBB19_3
 ; AVX1-NEXT:  .LBB19_1:
-; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB19_3:
-; AVX1-NEXT:    vminss %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512F-LABEL: test_fminimum_combine_cmps:
@@ -978,10 +992,11 @@
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
 ; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k2
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_fminimum_combine_cmps:
@@ -1000,20 +1015,20 @@
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT:    je .LBB19_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    vmovaps %xmm1, %xmm2
-; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm1
 ; X86-NEXT:    jmp .LBB19_3
 ; X86-NEXT:  .LBB19_1:
-; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm0
 ; X86-NEXT:  .LBB19_3:
-; X86-NEXT:    vminss %xmm2, %xmm1, %xmm1
-; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
+; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
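[Reviewer note, not part of the patch] The fminimum tests mirror the fmaximum sketch given after the X86ISelLowering.cpp hunk: for FMIN the preferred zero is -0, and the cmpl $-2147483648, %eax in the checks above is the runtime bit test for -0.0f (0x80000000) that decides whether the operands get swapped before MINSS. A minimal model, with illustrative names only:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <utility>

// The bit test performed by `cmpl $-2147483648, %eax` in the checks above.
static bool is_negative_zero(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return bits == 0x80000000u;
}

// Scalar model of MINSS: the second operand wins whenever `a < b` is false.
static float x86_minss(float a, float b) { return a < b ? a : b; }

static float lowered_minimum(float x, float y) {
  // For FMIN the preferred zero is -0: swap so it ends up as the second
  // operand, where MINSS returns it on the +0/-0 tie.
  if (is_negative_zero(x)) std::swap(x, y);
  float min_max = x86_minss(x, y);
  return x != x ? x : min_max; // cmpunord + blend fixup for NaN
}

int main() {
  assert(std::signbit(lowered_minimum(-0.0f, 0.0f)));
  assert(std::signbit(lowered_minimum(0.0f, -0.0f)));
  assert(std::isnan(lowered_minimum(NAN, 1.0f)));
  assert(lowered_minimum(1.0f, 2.0f) == 1.0f);
  return 0;
}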