diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20407,8 +20407,8 @@ // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // - // Adjust = (Value < Thresh) ? 0 : 0x80000000; - // FltOfs = (Value < Thresh) ? 0 : 0x80000000; + // Adjust = (Value >= Thresh) ? 0x80000000 : 0; + // FltOfs = (Value >= Thresh) ? 0x80000000 : 0; // FistSrc = (Value - FltOfs); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent @@ -20438,20 +20438,30 @@ *DAG.getContext(), TheVT); SDValue Cmp; if (IsStrict) { - Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, - Chain, /*IsSignaling*/ true); + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain, + /*IsSignaling*/ true); Chain = Cmp.getValue(1); } else { - Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT); + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE); } - Adjust = DAG.getSelect(DL, MVT::i64, Cmp, - DAG.getConstant(0, DL, MVT::i64), - DAG.getConstant(APInt::getSignMask(64), - DL, MVT::i64)); - SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, - DAG.getConstantFP(0.0, DL, TheVT), - ThreshVal); + // Our preferred lowering of + // + // (Value >= Thresh) ? 0x8000000000000000ULL : 0 + // + // is + // + // (Value >= Thresh) << 63 + // + // but since we can get here after LegalOperations, DAGCombine might do the + // wrong thing if we create a select. So, directly create the preferred + // version. + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp); + SDValue Const63 = DAG.getConstant(63, DL, MVT::i8); + Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63); + + SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal, + DAG.getConstantFP(0.0, DL, TheVT)); if (IsStrict) { Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll --- a/llvm/test/CodeGen/X86/fp-cvt.ll +++ b/llvm/test/CodeGen/X86/fp-cvt.ll @@ -451,13 +451,12 @@ ; X86-NEXT: sahf ; X86-NEXT: setbe %al ; X86-NEXT: fldz -; X86-NEXT: ja .LBB10_2 +; X86-NEXT: jbe .LBB10_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: fstp %st(0) +; X86-NEXT: fstp %st(1) ; X86-NEXT: fldz -; X86-NEXT: fxch %st(1) ; X86-NEXT: .LBB10_2: -; X86-NEXT: fstp %st(1) +; X86-NEXT: fstp %st(0) ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -482,8 +481,7 @@ ; X64-X87-NEXT: fucomi %st(1), %st ; X64-X87-NEXT: setbe %al ; X64-X87-NEXT: fldz -; X64-X87-NEXT: fxch %st(1) -; X64-X87-NEXT: fcmovnbe %st(1), %st +; X64-X87-NEXT: fcmovbe %st(1), %st ; X64-X87-NEXT: fstp %st(1) ; X64-X87-NEXT: fsubrp %st, %st(1) ; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -504,8 +502,7 @@ ; X64-SSSE3-NEXT: xorl %eax, %eax ; X64-SSSE3-NEXT: fucomi %st(1), %st ; X64-SSSE3-NEXT: fldz -; X64-SSSE3-NEXT: fxch %st(1) -; X64-SSSE3-NEXT: fcmovnbe %st(1), %st +; X64-SSSE3-NEXT: fcmovbe %st(1), %st ; X64-SSSE3-NEXT: fstp %st(1) ; X64-SSSE3-NEXT: fsubrp %st, %st(1) ; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp) @@ -534,13 +531,12 @@ ; X86-NEXT: sahf ; X86-NEXT: setbe %al ; X86-NEXT: fldz -; X86-NEXT: ja .LBB11_2 +; X86-NEXT: jbe .LBB11_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: fstp %st(0) +; X86-NEXT: fstp %st(1) ; X86-NEXT: fldz -; X86-NEXT: fxch %st(1) ; X86-NEXT: .LBB11_2: -; X86-NEXT: fstp %st(1) +; X86-NEXT: fstp %st(0) ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -565,8 +561,7 @@ ; X64-X87-NEXT: fucomi %st(1), %st ; X64-X87-NEXT: setbe %al ; X64-X87-NEXT: fldz -; X64-X87-NEXT: fxch %st(1) -; X64-X87-NEXT: fcmovnbe %st(1), %st +; X64-X87-NEXT: fcmovbe %st(1), %st ; X64-X87-NEXT: fstp %st(1) ; X64-X87-NEXT: fsubrp %st, %st(1) ; X64-X87-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -587,8 +582,7 @@ ; X64-SSSE3-NEXT: xorl %eax, %eax ; X64-SSSE3-NEXT: fucomi %st(1), %st ; X64-SSSE3-NEXT: fldz -; X64-SSSE3-NEXT: fxch %st(1) -; X64-SSSE3-NEXT: fcmovnbe %st(1), %st +; X64-SSSE3-NEXT: fcmovbe %st(1), %st ; X64-SSSE3-NEXT: fstp %st(1) ; X64-SSSE3-NEXT: fsubrp %st, %st(1) ; X64-SSSE3-NEXT: fisttpll -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll @@ -32,7 +32,7 @@ ; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr ; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD ; CHECK: JCC_1 -; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}} +; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[FsFLD0SD]], {{.*}}, [[MOVSDrm_alt1]], {{.*}} ; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr ; CHECK: MOVSDmr %stack.0, 1, $noreg, 0, $noreg, killed [[SUBSDrr]] :: (store 8 into %stack.0) ; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1363,8 +1363,7 @@ ; X87-NEXT: wait ; X87-NEXT: setbe %dl ; X87-NEXT: fldz -; X87-NEXT: fxch %st(1) -; X87-NEXT: fcmovnbe %st(1), %st +; X87-NEXT: fcmovbe %st(1), %st ; X87-NEXT: fstp %st(1) ; X87-NEXT: fsubrp %st, %st(1) ; X87-NEXT: wait @@ -1387,12 +1386,11 @@ ; X86-SSE-NEXT: subl $20, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 24 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X86-SSE-NEXT: comisd %xmm0, %xmm2 -; X86-SSE-NEXT: xorpd %xmm1, %xmm1 -; X86-SSE-NEXT: ja .LBB25_2 +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: comisd %xmm0, %xmm1 +; X86-SSE-NEXT: jbe .LBB25_2 ; X86-SSE-NEXT: # %bb.1: # %entry -; X86-SSE-NEXT: movapd %xmm2, %xmm1 +; X86-SSE-NEXT: xorpd %xmm1, %xmm1 ; X86-SSE-NEXT: .LBB25_2: # %entry ; X86-SSE-NEXT: subsd %xmm1, %xmm0 ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -543,12 +543,11 @@ ; SSE-X86-NEXT: andl $-8, %esp ; SSE-X86-NEXT: subl $16, %esp ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-X86-NEXT: comiss %xmm0, %xmm2 -; SSE-X86-NEXT: xorps %xmm1, %xmm1 -; SSE-X86-NEXT: ja .LBB9_2 +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: comiss %xmm0, %xmm1 +; SSE-X86-NEXT: jbe .LBB9_2 ; SSE-X86-NEXT: # %bb.1: -; SSE-X86-NEXT: movaps %xmm2, %xmm1 +; SSE-X86-NEXT: xorps %xmm1, %xmm1 ; SSE-X86-NEXT: .LBB9_2: ; SSE-X86-NEXT: subss %xmm1, %xmm0 ; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) @@ -600,12 +599,11 @@ ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-X86-NEXT: vcomiss %xmm0, %xmm1 -; AVX1-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-X86-NEXT: ja .LBB9_2 +; AVX1-X86-NEXT: jbe .LBB9_2 ; AVX1-X86-NEXT: # %bb.1: -; AVX1-X86-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-X86-NEXT: .LBB9_2: -; AVX1-X86-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX1-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX1-X86-NEXT: vmovss %xmm0, (%esp) ; AVX1-X86-NEXT: flds (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) @@ -650,16 +648,14 @@ ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512-X86-NEXT: xorl %edx, %edx ; AVX512-X86-NEXT: vcomiss %xmm0, %xmm1 -; AVX512-X86-NEXT: seta %al -; AVX512-X86-NEXT: kmovw %eax, %k1 -; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-X86-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-X86-NEXT: setbe %dl +; AVX512-X86-NEXT: kmovw %edx, %k1 +; AVX512-X86-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-X86-NEXT: vmovss %xmm0, (%esp) ; AVX512-X86-NEXT: flds (%esp) ; AVX512-X86-NEXT: fisttpll (%esp) ; AVX512-X86-NEXT: wait -; AVX512-X86-NEXT: setbe %dl ; AVX512-X86-NEXT: shll $31, %edx ; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512-X86-NEXT: movl (%esp), %eax @@ -692,13 +688,12 @@ ; CHECK-NEXT: sahf ; CHECK-NEXT: setbe %al ; CHECK-NEXT: fldz -; CHECK-NEXT: ja .LBB9_2 +; CHECK-NEXT: jbe .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fstp %st(0) +; CHECK-NEXT: fstp %st(1) ; CHECK-NEXT: fldz -; CHECK-NEXT: fxch %st(1) ; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: fstp %st(1) +; CHECK-NEXT: fstp %st(0) ; CHECK-NEXT: fsubrp %st, %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) @@ -1188,12 +1183,11 @@ ; SSE-X86-NEXT: andl $-8, %esp ; SSE-X86-NEXT: subl $16, %esp ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-X86-NEXT: comisd %xmm0, %xmm2 -; SSE-X86-NEXT: xorpd %xmm1, %xmm1 -; SSE-X86-NEXT: ja .LBB18_2 +; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X86-NEXT: comisd %xmm0, %xmm1 +; SSE-X86-NEXT: jbe .LBB18_2 ; SSE-X86-NEXT: # %bb.1: -; SSE-X86-NEXT: movapd %xmm2, %xmm1 +; SSE-X86-NEXT: xorpd %xmm1, %xmm1 ; SSE-X86-NEXT: .LBB18_2: ; SSE-X86-NEXT: subsd %xmm1, %xmm0 ; SSE-X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) @@ -1245,12 +1239,11 @@ ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-X86-NEXT: vcomisd %xmm0, %xmm1 -; AVX1-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX1-X86-NEXT: ja .LBB18_2 +; AVX1-X86-NEXT: jbe .LBB18_2 ; AVX1-X86-NEXT: # %bb.1: -; AVX1-X86-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX1-X86-NEXT: .LBB18_2: -; AVX1-X86-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX1-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX1-X86-NEXT: fldl (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) @@ -1295,16 +1288,14 @@ ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-X86-NEXT: xorl %edx, %edx ; AVX512-X86-NEXT: vcomisd %xmm0, %xmm1 -; AVX512-X86-NEXT: seta %al -; AVX512-X86-NEXT: kmovw %eax, %k1 -; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX512-X86-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-X86-NEXT: setbe %dl +; AVX512-X86-NEXT: kmovw %edx, %k1 +; AVX512-X86-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX512-X86-NEXT: fldl (%esp) ; AVX512-X86-NEXT: fisttpll (%esp) ; AVX512-X86-NEXT: wait -; AVX512-X86-NEXT: setbe %dl ; AVX512-X86-NEXT: shll $31, %edx ; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512-X86-NEXT: movl (%esp), %eax @@ -1337,13 +1328,12 @@ ; CHECK-NEXT: sahf ; CHECK-NEXT: setbe %al ; CHECK-NEXT: fldz -; CHECK-NEXT: ja .LBB18_2 +; CHECK-NEXT: jbe .LBB18_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: fstp %st(0) +; CHECK-NEXT: fstp %st(1) ; CHECK-NEXT: fldz -; CHECK-NEXT: fxch %st(1) ; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: fstp %st(1) +; CHECK-NEXT: fstp %st(0) ; CHECK-NEXT: fsubrp %st, %st(1) ; CHECK-NEXT: wait ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll --- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll @@ -597,13 +597,12 @@ ; X86-NEXT: sahf ; X86-NEXT: setbe %al ; X86-NEXT: fldz -; X86-NEXT: ja .LBB18_2 +; X86-NEXT: jbe .LBB18_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: fstp %st(0) +; X86-NEXT: fstp %st(1) ; X86-NEXT: fldz -; X86-NEXT: fxch %st(1) ; X86-NEXT: .LBB18_2: -; X86-NEXT: fstp %st(1) +; X86-NEXT: fstp %st(0) ; X86-NEXT: fsubrp %st, %st(1) ; X86-NEXT: wait ; X86-NEXT: fnstcw {{[0-9]+}}(%esp) @@ -632,8 +631,7 @@ ; X64-NEXT: wait ; X64-NEXT: setbe %al ; X64-NEXT: fldz -; X64-NEXT: fxch %st(1) -; X64-NEXT: fcmovnbe %st(1), %st +; X64-NEXT: fcmovbe %st(1), %st ; X64-NEXT: fstp %st(1) ; X64-NEXT: fsubrp %st, %st(1) ; X64-NEXT: wait diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -69,16 +69,15 @@ ; X86-AVX512F-WIN-NEXT: subl $8, %esp ; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-WIN-NEXT: vcmpltss %xmm1, %xmm0, %k1 -; X86-AVX512F-WIN-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX512F-WIN-NEXT: xorl %edx, %edx ; X86-AVX512F-WIN-NEXT: vucomiss %xmm0, %xmm1 -; X86-AVX512F-WIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; X86-AVX512F-WIN-NEXT: setbe %dl +; X86-AVX512F-WIN-NEXT: kmovw %edx, %k1 +; X86-AVX512F-WIN-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} ; X86-AVX512F-WIN-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; X86-AVX512F-WIN-NEXT: vmovss %xmm0, (%esp) ; X86-AVX512F-WIN-NEXT: flds (%esp) ; X86-AVX512F-WIN-NEXT: fisttpll (%esp) -; X86-AVX512F-WIN-NEXT: setbe %dl ; X86-AVX512F-WIN-NEXT: shll $31, %edx ; X86-AVX512F-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512F-WIN-NEXT: movl (%esp), %eax @@ -91,16 +90,15 @@ ; X86-AVX512F-LIN-NEXT: subl $12, %esp ; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-LIN-NEXT: vcmpltss %xmm1, %xmm0, %k1 -; X86-AVX512F-LIN-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX512F-LIN-NEXT: xorl %edx, %edx ; X86-AVX512F-LIN-NEXT: vucomiss %xmm0, %xmm1 -; X86-AVX512F-LIN-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; X86-AVX512F-LIN-NEXT: setbe %dl +; X86-AVX512F-LIN-NEXT: kmovw %edx, %k1 +; X86-AVX512F-LIN-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} ; X86-AVX512F-LIN-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; X86-AVX512F-LIN-NEXT: vmovss %xmm0, (%esp) ; X86-AVX512F-LIN-NEXT: flds (%esp) ; X86-AVX512F-LIN-NEXT: fisttpll (%esp) -; X86-AVX512F-LIN-NEXT: setbe %dl ; X86-AVX512F-LIN-NEXT: shll $31, %edx ; X86-AVX512F-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512F-LIN-NEXT: movl (%esp), %eax @@ -115,16 +113,17 @@ ; X86-SSE3-WIN-NEXT: subl $8, %esp ; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE3-WIN-NEXT: movaps %xmm0, %xmm2 -; X86-SSE3-WIN-NEXT: xorl %edx, %edx ; X86-SSE3-WIN-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE3-WIN-NEXT: cmpltss %xmm1, %xmm0 -; X86-SSE3-WIN-NEXT: andnps %xmm1, %xmm0 -; X86-SSE3-WIN-NEXT: subss %xmm0, %xmm2 -; X86-SSE3-WIN-NEXT: movss %xmm2, (%esp) +; X86-SSE3-WIN-NEXT: jbe LBB0_2 +; X86-SSE3-WIN-NEXT: # %bb.1: +; X86-SSE3-WIN-NEXT: xorps %xmm1, %xmm1 +; X86-SSE3-WIN-NEXT: LBB0_2: +; X86-SSE3-WIN-NEXT: subss %xmm1, %xmm0 +; X86-SSE3-WIN-NEXT: movss %xmm0, (%esp) ; X86-SSE3-WIN-NEXT: flds (%esp) ; X86-SSE3-WIN-NEXT: fisttpll (%esp) -; X86-SSE3-WIN-NEXT: setbe %dl +; X86-SSE3-WIN-NEXT: setbe %al +; X86-SSE3-WIN-NEXT: movzbl %al, %edx ; X86-SSE3-WIN-NEXT: shll $31, %edx ; X86-SSE3-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-WIN-NEXT: movl (%esp), %eax @@ -137,16 +136,17 @@ ; X86-SSE3-LIN-NEXT: subl $12, %esp ; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE3-LIN-NEXT: movaps %xmm0, %xmm2 -; X86-SSE3-LIN-NEXT: xorl %edx, %edx ; X86-SSE3-LIN-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE3-LIN-NEXT: cmpltss %xmm1, %xmm0 -; X86-SSE3-LIN-NEXT: andnps %xmm1, %xmm0 -; X86-SSE3-LIN-NEXT: subss %xmm0, %xmm2 -; X86-SSE3-LIN-NEXT: movss %xmm2, (%esp) +; X86-SSE3-LIN-NEXT: jbe .LBB0_2 +; X86-SSE3-LIN-NEXT: # %bb.1: +; X86-SSE3-LIN-NEXT: xorps %xmm1, %xmm1 +; X86-SSE3-LIN-NEXT: .LBB0_2: +; X86-SSE3-LIN-NEXT: subss %xmm1, %xmm0 +; X86-SSE3-LIN-NEXT: movss %xmm0, (%esp) ; X86-SSE3-LIN-NEXT: flds (%esp) ; X86-SSE3-LIN-NEXT: fisttpll (%esp) -; X86-SSE3-LIN-NEXT: setbe %dl +; X86-SSE3-LIN-NEXT: setbe %al +; X86-SSE3-LIN-NEXT: movzbl %al, %edx ; X86-SSE3-LIN-NEXT: shll $31, %edx ; X86-SSE3-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-LIN-NEXT: movl (%esp), %eax @@ -174,23 +174,23 @@ ; X86-SSE2-WIN-NEXT: subl $16, %esp ; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-WIN-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-WIN-NEXT: cmpltss %xmm1, %xmm2 -; X86-SSE2-WIN-NEXT: andnps %xmm1, %xmm2 -; X86-SSE2-WIN-NEXT: movaps %xmm0, %xmm3 -; X86-SSE2-WIN-NEXT: subss %xmm2, %xmm3 -; X86-SSE2-WIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: ucomiss %xmm0, %xmm1 +; X86-SSE2-WIN-NEXT: jbe LBB0_2 +; X86-SSE2-WIN-NEXT: # %bb.1: +; X86-SSE2-WIN-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-WIN-NEXT: LBB0_2: +; X86-SSE2-WIN-NEXT: subss %xmm1, %xmm0 +; X86-SSE2-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: setbe %al ; X86-SSE2-WIN-NEXT: flds {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE2-WIN-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE2-WIN-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-WIN-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE2-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE2-WIN-NEXT: xorl %edx, %edx -; X86-SSE2-WIN-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE2-WIN-NEXT: setbe %dl +; X86-SSE2-WIN-NEXT: movzbl %al, %edx ; X86-SSE2-WIN-NEXT: shll $31, %edx ; X86-SSE2-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE2-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -203,23 +203,23 @@ ; X86-SSE2-LIN-NEXT: subl $20, %esp ; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-LIN-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-LIN-NEXT: cmpltss %xmm1, %xmm2 -; X86-SSE2-LIN-NEXT: andnps %xmm1, %xmm2 -; X86-SSE2-LIN-NEXT: movaps %xmm0, %xmm3 -; X86-SSE2-LIN-NEXT: subss %xmm2, %xmm3 -; X86-SSE2-LIN-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: ucomiss %xmm0, %xmm1 +; X86-SSE2-LIN-NEXT: jbe .LBB0_2 +; X86-SSE2-LIN-NEXT: # %bb.1: +; X86-SSE2-LIN-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-LIN-NEXT: .LBB0_2: +; X86-SSE2-LIN-NEXT: subss %xmm1, %xmm0 +; X86-SSE2-LIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: setbe %al ; X86-SSE2-LIN-NEXT: flds {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE2-LIN-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE2-LIN-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-LIN-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE2-LIN-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE2-LIN-NEXT: xorl %edx, %edx -; X86-SSE2-LIN-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE2-LIN-NEXT: setbe %dl +; X86-SSE2-LIN-NEXT: movzbl %al, %edx ; X86-SSE2-LIN-NEXT: shll $31, %edx ; X86-SSE2-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE2-LIN-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -241,13 +241,12 @@ ; X87-WIN-NEXT: sahf ; X87-WIN-NEXT: setbe %al ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: ja LBB0_2 +; X87-WIN-NEXT: jbe LBB0_2 ; X87-WIN-NEXT: # %bb.1: -; X87-WIN-NEXT: fstp %st(0) +; X87-WIN-NEXT: fstp %st(1) ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: fxch %st(1) ; X87-WIN-NEXT: LBB0_2: -; X87-WIN-NEXT: fstp %st(1) +; X87-WIN-NEXT: fstp %st(0) ; X87-WIN-NEXT: fsubrp %st, %st(1) ; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -276,13 +275,12 @@ ; X87-LIN-NEXT: sahf ; X87-LIN-NEXT: setbe %al ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: ja .LBB0_2 +; X87-LIN-NEXT: jbe .LBB0_2 ; X87-LIN-NEXT: # %bb.1: -; X87-LIN-NEXT: fstp %st(0) +; X87-LIN-NEXT: fstp %st(1) ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: fxch %st(1) ; X87-LIN-NEXT: .LBB0_2: -; X87-LIN-NEXT: fstp %st(1) +; X87-LIN-NEXT: fstp %st(0) ; X87-LIN-NEXT: fsubrp %st, %st(1) ; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -495,16 +493,15 @@ ; X86-AVX512F-WIN-NEXT: subl $8, %esp ; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-WIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1 -; X86-AVX512F-WIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; X86-AVX512F-WIN-NEXT: xorl %edx, %edx ; X86-AVX512F-WIN-NEXT: vucomisd %xmm0, %xmm1 -; X86-AVX512F-WIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; X86-AVX512F-WIN-NEXT: setbe %dl +; X86-AVX512F-WIN-NEXT: kmovw %edx, %k1 +; X86-AVX512F-WIN-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; X86-AVX512F-WIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; X86-AVX512F-WIN-NEXT: vmovsd %xmm0, (%esp) ; X86-AVX512F-WIN-NEXT: fldl (%esp) ; X86-AVX512F-WIN-NEXT: fisttpll (%esp) -; X86-AVX512F-WIN-NEXT: setbe %dl ; X86-AVX512F-WIN-NEXT: shll $31, %edx ; X86-AVX512F-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512F-WIN-NEXT: movl (%esp), %eax @@ -517,16 +514,15 @@ ; X86-AVX512F-LIN-NEXT: subl $12, %esp ; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-LIN-NEXT: vcmpltsd %xmm1, %xmm0, %k1 -; X86-AVX512F-LIN-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; X86-AVX512F-LIN-NEXT: xorl %edx, %edx ; X86-AVX512F-LIN-NEXT: vucomisd %xmm0, %xmm1 -; X86-AVX512F-LIN-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; X86-AVX512F-LIN-NEXT: setbe %dl +; X86-AVX512F-LIN-NEXT: kmovw %edx, %k1 +; X86-AVX512F-LIN-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; X86-AVX512F-LIN-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; X86-AVX512F-LIN-NEXT: vmovsd %xmm0, (%esp) ; X86-AVX512F-LIN-NEXT: fldl (%esp) ; X86-AVX512F-LIN-NEXT: fisttpll (%esp) -; X86-AVX512F-LIN-NEXT: setbe %dl ; X86-AVX512F-LIN-NEXT: shll $31, %edx ; X86-AVX512F-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512F-LIN-NEXT: movl (%esp), %eax @@ -541,16 +537,17 @@ ; X86-SSE3-WIN-NEXT: subl $8, %esp ; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE3-WIN-NEXT: movapd %xmm0, %xmm2 -; X86-SSE3-WIN-NEXT: xorl %edx, %edx ; X86-SSE3-WIN-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE3-WIN-NEXT: cmpltsd %xmm1, %xmm0 -; X86-SSE3-WIN-NEXT: andnpd %xmm1, %xmm0 -; X86-SSE3-WIN-NEXT: subsd %xmm0, %xmm2 -; X86-SSE3-WIN-NEXT: movsd %xmm2, (%esp) +; X86-SSE3-WIN-NEXT: jbe LBB2_2 +; X86-SSE3-WIN-NEXT: # %bb.1: +; X86-SSE3-WIN-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE3-WIN-NEXT: LBB2_2: +; X86-SSE3-WIN-NEXT: subsd %xmm1, %xmm0 +; X86-SSE3-WIN-NEXT: movsd %xmm0, (%esp) ; X86-SSE3-WIN-NEXT: fldl (%esp) ; X86-SSE3-WIN-NEXT: fisttpll (%esp) -; X86-SSE3-WIN-NEXT: setbe %dl +; X86-SSE3-WIN-NEXT: setbe %al +; X86-SSE3-WIN-NEXT: movzbl %al, %edx ; X86-SSE3-WIN-NEXT: shll $31, %edx ; X86-SSE3-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-WIN-NEXT: movl (%esp), %eax @@ -563,16 +560,17 @@ ; X86-SSE3-LIN-NEXT: subl $12, %esp ; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE3-LIN-NEXT: movapd %xmm0, %xmm2 -; X86-SSE3-LIN-NEXT: xorl %edx, %edx ; X86-SSE3-LIN-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE3-LIN-NEXT: cmpltsd %xmm1, %xmm0 -; X86-SSE3-LIN-NEXT: andnpd %xmm1, %xmm0 -; X86-SSE3-LIN-NEXT: subsd %xmm0, %xmm2 -; X86-SSE3-LIN-NEXT: movsd %xmm2, (%esp) +; X86-SSE3-LIN-NEXT: jbe .LBB2_2 +; X86-SSE3-LIN-NEXT: # %bb.1: +; X86-SSE3-LIN-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE3-LIN-NEXT: .LBB2_2: +; X86-SSE3-LIN-NEXT: subsd %xmm1, %xmm0 +; X86-SSE3-LIN-NEXT: movsd %xmm0, (%esp) ; X86-SSE3-LIN-NEXT: fldl (%esp) ; X86-SSE3-LIN-NEXT: fisttpll (%esp) -; X86-SSE3-LIN-NEXT: setbe %dl +; X86-SSE3-LIN-NEXT: setbe %al +; X86-SSE3-LIN-NEXT: movzbl %al, %edx ; X86-SSE3-LIN-NEXT: shll $31, %edx ; X86-SSE3-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-LIN-NEXT: movl (%esp), %eax @@ -600,23 +598,23 @@ ; X86-SSE2-WIN-NEXT: subl $16, %esp ; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE2-WIN-NEXT: movapd %xmm0, %xmm2 -; X86-SSE2-WIN-NEXT: cmpltsd %xmm1, %xmm2 -; X86-SSE2-WIN-NEXT: andnpd %xmm1, %xmm2 -; X86-SSE2-WIN-NEXT: movapd %xmm0, %xmm3 -; X86-SSE2-WIN-NEXT: subsd %xmm2, %xmm3 -; X86-SSE2-WIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: ucomisd %xmm0, %xmm1 +; X86-SSE2-WIN-NEXT: jbe LBB2_2 +; X86-SSE2-WIN-NEXT: # %bb.1: +; X86-SSE2-WIN-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE2-WIN-NEXT: LBB2_2: +; X86-SSE2-WIN-NEXT: subsd %xmm1, %xmm0 +; X86-SSE2-WIN-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: setbe %al ; X86-SSE2-WIN-NEXT: fldl {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE2-WIN-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE2-WIN-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE2-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-WIN-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE2-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE2-WIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE2-WIN-NEXT: xorl %edx, %edx -; X86-SSE2-WIN-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE2-WIN-NEXT: setbe %dl +; X86-SSE2-WIN-NEXT: movzbl %al, %edx ; X86-SSE2-WIN-NEXT: shll $31, %edx ; X86-SSE2-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE2-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -629,23 +627,23 @@ ; X86-SSE2-LIN-NEXT: subl $20, %esp ; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE2-LIN-NEXT: movapd %xmm0, %xmm2 -; X86-SSE2-LIN-NEXT: cmpltsd %xmm1, %xmm2 -; X86-SSE2-LIN-NEXT: andnpd %xmm1, %xmm2 -; X86-SSE2-LIN-NEXT: movapd %xmm0, %xmm3 -; X86-SSE2-LIN-NEXT: subsd %xmm2, %xmm3 -; X86-SSE2-LIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: ucomisd %xmm0, %xmm1 +; X86-SSE2-LIN-NEXT: jbe .LBB2_2 +; X86-SSE2-LIN-NEXT: # %bb.1: +; X86-SSE2-LIN-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE2-LIN-NEXT: .LBB2_2: +; X86-SSE2-LIN-NEXT: subsd %xmm1, %xmm0 +; X86-SSE2-LIN-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: setbe %al ; X86-SSE2-LIN-NEXT: fldl {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE2-LIN-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE2-LIN-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE2-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-LIN-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE2-LIN-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE2-LIN-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE2-LIN-NEXT: xorl %edx, %edx -; X86-SSE2-LIN-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE2-LIN-NEXT: setbe %dl +; X86-SSE2-LIN-NEXT: movzbl %al, %edx ; X86-SSE2-LIN-NEXT: shll $31, %edx ; X86-SSE2-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE2-LIN-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -667,13 +665,12 @@ ; X87-WIN-NEXT: sahf ; X87-WIN-NEXT: setbe %al ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: ja LBB2_2 +; X87-WIN-NEXT: jbe LBB2_2 ; X87-WIN-NEXT: # %bb.1: -; X87-WIN-NEXT: fstp %st(0) +; X87-WIN-NEXT: fstp %st(1) ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: fxch %st(1) ; X87-WIN-NEXT: LBB2_2: -; X87-WIN-NEXT: fstp %st(1) +; X87-WIN-NEXT: fstp %st(0) ; X87-WIN-NEXT: fsubrp %st, %st(1) ; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -702,13 +699,12 @@ ; X87-LIN-NEXT: sahf ; X87-LIN-NEXT: setbe %al ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: ja .LBB2_2 +; X87-LIN-NEXT: jbe .LBB2_2 ; X87-LIN-NEXT: # %bb.1: -; X87-LIN-NEXT: fstp %st(0) +; X87-LIN-NEXT: fstp %st(1) ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: fxch %st(1) ; X87-LIN-NEXT: .LBB2_2: -; X87-LIN-NEXT: fstp %st(1) +; X87-LIN-NEXT: fstp %st(0) ; X87-LIN-NEXT: fsubrp %st, %st(1) ; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -902,8 +898,7 @@ ; X86-AVX512-WIN-NEXT: xorl %edx, %edx ; X86-AVX512-WIN-NEXT: fucomi %st(1), %st ; X86-AVX512-WIN-NEXT: fldz -; X86-AVX512-WIN-NEXT: fxch %st(1) -; X86-AVX512-WIN-NEXT: fcmovnbe %st(1), %st +; X86-AVX512-WIN-NEXT: fcmovbe %st(1), %st ; X86-AVX512-WIN-NEXT: fstp %st(1) ; X86-AVX512-WIN-NEXT: fsubrp %st, %st(1) ; X86-AVX512-WIN-NEXT: fisttpll (%esp) @@ -923,8 +918,7 @@ ; X86-AVX512-LIN-NEXT: xorl %edx, %edx ; X86-AVX512-LIN-NEXT: fucomi %st(1), %st ; X86-AVX512-LIN-NEXT: fldz -; X86-AVX512-LIN-NEXT: fxch %st(1) -; X86-AVX512-LIN-NEXT: fcmovnbe %st(1), %st +; X86-AVX512-LIN-NEXT: fcmovbe %st(1), %st ; X86-AVX512-LIN-NEXT: fstp %st(1) ; X86-AVX512-LIN-NEXT: fsubrp %st, %st(1) ; X86-AVX512-LIN-NEXT: fisttpll (%esp) @@ -943,8 +937,7 @@ ; X64-AVX512-WIN-NEXT: xorl %eax, %eax ; X64-AVX512-WIN-NEXT: fucomi %st(1), %st ; X64-AVX512-WIN-NEXT: fldz -; X64-AVX512-WIN-NEXT: fxch %st(1) -; X64-AVX512-WIN-NEXT: fcmovnbe %st(1), %st +; X64-AVX512-WIN-NEXT: fcmovbe %st(1), %st ; X64-AVX512-WIN-NEXT: fstp %st(1) ; X64-AVX512-WIN-NEXT: fsubrp %st, %st(1) ; X64-AVX512-WIN-NEXT: fisttpll (%rsp) @@ -961,8 +954,7 @@ ; X64-AVX512-LIN-NEXT: xorl %eax, %eax ; X64-AVX512-LIN-NEXT: fucomi %st(1), %st ; X64-AVX512-LIN-NEXT: fldz -; X64-AVX512-LIN-NEXT: fxch %st(1) -; X64-AVX512-LIN-NEXT: fcmovnbe %st(1), %st +; X64-AVX512-LIN-NEXT: fcmovbe %st(1), %st ; X64-AVX512-LIN-NEXT: fstp %st(1) ; X64-AVX512-LIN-NEXT: fsubrp %st, %st(1) ; X64-AVX512-LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp) @@ -982,8 +974,7 @@ ; X86-SSE3-WIN-NEXT: xorl %edx, %edx ; X86-SSE3-WIN-NEXT: fucomi %st(1), %st ; X86-SSE3-WIN-NEXT: fldz -; X86-SSE3-WIN-NEXT: fxch %st(1) -; X86-SSE3-WIN-NEXT: fcmovnbe %st(1), %st +; X86-SSE3-WIN-NEXT: fcmovbe %st(1), %st ; X86-SSE3-WIN-NEXT: fstp %st(1) ; X86-SSE3-WIN-NEXT: fsubrp %st, %st(1) ; X86-SSE3-WIN-NEXT: fisttpll (%esp) @@ -1003,8 +994,7 @@ ; X86-SSE3-LIN-NEXT: xorl %edx, %edx ; X86-SSE3-LIN-NEXT: fucomi %st(1), %st ; X86-SSE3-LIN-NEXT: fldz -; X86-SSE3-LIN-NEXT: fxch %st(1) -; X86-SSE3-LIN-NEXT: fcmovnbe %st(1), %st +; X86-SSE3-LIN-NEXT: fcmovbe %st(1), %st ; X86-SSE3-LIN-NEXT: fstp %st(1) ; X86-SSE3-LIN-NEXT: fsubrp %st, %st(1) ; X86-SSE3-LIN-NEXT: fisttpll (%esp) @@ -1023,8 +1013,7 @@ ; X64-SSE3-WIN-NEXT: xorl %eax, %eax ; X64-SSE3-WIN-NEXT: fucomi %st(1), %st ; X64-SSE3-WIN-NEXT: fldz -; X64-SSE3-WIN-NEXT: fxch %st(1) -; X64-SSE3-WIN-NEXT: fcmovnbe %st(1), %st +; X64-SSE3-WIN-NEXT: fcmovbe %st(1), %st ; X64-SSE3-WIN-NEXT: fstp %st(1) ; X64-SSE3-WIN-NEXT: fsubrp %st, %st(1) ; X64-SSE3-WIN-NEXT: fisttpll (%rsp) @@ -1041,8 +1030,7 @@ ; X64-SSE3-LIN-NEXT: xorl %eax, %eax ; X64-SSE3-LIN-NEXT: fucomi %st(1), %st ; X64-SSE3-LIN-NEXT: fldz -; X64-SSE3-LIN-NEXT: fxch %st(1) -; X64-SSE3-LIN-NEXT: fcmovnbe %st(1), %st +; X64-SSE3-LIN-NEXT: fcmovbe %st(1), %st ; X64-SSE3-LIN-NEXT: fstp %st(1) ; X64-SSE3-LIN-NEXT: fsubrp %st, %st(1) ; X64-SSE3-LIN-NEXT: fisttpll -{{[0-9]+}}(%rsp) @@ -1063,8 +1051,7 @@ ; X86-SSE2-WIN-NEXT: fucomi %st(1), %st ; X86-SSE2-WIN-NEXT: setbe %dl ; X86-SSE2-WIN-NEXT: fldz -; X86-SSE2-WIN-NEXT: fxch %st(1) -; X86-SSE2-WIN-NEXT: fcmovnbe %st(1), %st +; X86-SSE2-WIN-NEXT: fcmovbe %st(1), %st ; X86-SSE2-WIN-NEXT: fstp %st(1) ; X86-SSE2-WIN-NEXT: fsubrp %st, %st(1) ; X86-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) @@ -1090,8 +1077,7 @@ ; X86-SSE2-LIN-NEXT: fucomi %st(1), %st ; X86-SSE2-LIN-NEXT: setbe %dl ; X86-SSE2-LIN-NEXT: fldz -; X86-SSE2-LIN-NEXT: fxch %st(1) -; X86-SSE2-LIN-NEXT: fcmovnbe %st(1), %st +; X86-SSE2-LIN-NEXT: fcmovbe %st(1), %st ; X86-SSE2-LIN-NEXT: fstp %st(1) ; X86-SSE2-LIN-NEXT: fsubrp %st, %st(1) ; X86-SSE2-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) @@ -1116,8 +1102,7 @@ ; X64-SSE2-WIN-NEXT: fucomi %st(1), %st ; X64-SSE2-WIN-NEXT: setbe %al ; X64-SSE2-WIN-NEXT: fldz -; X64-SSE2-WIN-NEXT: fxch %st(1) -; X64-SSE2-WIN-NEXT: fcmovnbe %st(1), %st +; X64-SSE2-WIN-NEXT: fcmovbe %st(1), %st ; X64-SSE2-WIN-NEXT: fstp %st(1) ; X64-SSE2-WIN-NEXT: fsubrp %st, %st(1) ; X64-SSE2-WIN-NEXT: fnstcw {{[0-9]+}}(%rsp) @@ -1140,8 +1125,7 @@ ; X64-SSE2-LIN-NEXT: fucomi %st(1), %st ; X64-SSE2-LIN-NEXT: setbe %al ; X64-SSE2-LIN-NEXT: fldz -; X64-SSE2-LIN-NEXT: fxch %st(1) -; X64-SSE2-LIN-NEXT: fcmovnbe %st(1), %st +; X64-SSE2-LIN-NEXT: fcmovbe %st(1), %st ; X64-SSE2-LIN-NEXT: fstp %st(1) ; X64-SSE2-LIN-NEXT: fsubrp %st, %st(1) ; X64-SSE2-LIN-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -1170,13 +1154,12 @@ ; X87-WIN-NEXT: sahf ; X87-WIN-NEXT: setbe %al ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: ja LBB4_2 +; X87-WIN-NEXT: jbe LBB4_2 ; X87-WIN-NEXT: # %bb.1: -; X87-WIN-NEXT: fstp %st(0) +; X87-WIN-NEXT: fstp %st(1) ; X87-WIN-NEXT: fldz -; X87-WIN-NEXT: fxch %st(1) ; X87-WIN-NEXT: LBB4_2: -; X87-WIN-NEXT: fstp %st(1) +; X87-WIN-NEXT: fstp %st(0) ; X87-WIN-NEXT: fsubrp %st, %st(1) ; X87-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx @@ -1205,13 +1188,12 @@ ; X87-LIN-NEXT: sahf ; X87-LIN-NEXT: setbe %al ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: ja .LBB4_2 +; X87-LIN-NEXT: jbe .LBB4_2 ; X87-LIN-NEXT: # %bb.1: -; X87-LIN-NEXT: fstp %st(0) +; X87-LIN-NEXT: fstp %st(1) ; X87-LIN-NEXT: fldz -; X87-LIN-NEXT: fxch %st(1) ; X87-LIN-NEXT: .LBB4_2: -; X87-LIN-NEXT: fstp %st(1) +; X87-LIN-NEXT: fstp %st(0) ; X87-LIN-NEXT: fsubrp %st, %st(1) ; X87-LIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X87-LIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -222,17 +222,16 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: xorpd %xmm1, %xmm1 -; SSE-32-NEXT: xorpd %xmm3, %xmm3 -; SSE-32-NEXT: jb .LBB1_2 +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: movapd %xmm1, %xmm2 +; SSE-32-NEXT: jae .LBB1_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: xorpd %xmm2, %xmm2 ; SSE-32-NEXT: .LBB1_2: -; SSE-32-NEXT: movapd %xmm0, %xmm4 -; SSE-32-NEXT: subsd %xmm3, %xmm4 -; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: subsd %xmm2, %xmm3 +; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait @@ -244,10 +243,10 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: jb .LBB1_4 +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: jae .LBB1_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: xorpd %xmm1, %xmm1 ; SSE-32-NEXT: .LBB1_4: ; SSE-32-NEXT: subsd %xmm1, %xmm0 ; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) @@ -323,17 +322,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB1_2 +; AVX-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB1_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB1_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, (%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovsd %xmm2, (%esp) ; AVX-32-NEXT: fldl (%esp) ; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait @@ -342,11 +340,11 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB1_4 +; AVX-32-NEXT: jae .LBB1_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB1_4: -; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) @@ -410,30 +408,25 @@ ; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomisd %xmm2, %xmm1 -; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovapd %xmm2, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovsd %xmm1, (%esp) -; AVX512F-32-NEXT: fldl (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) -; AVX512F-32-NEXT: wait ; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovsd %xmm1, (%esp) ; AVX512F-32-NEXT: xorl %ecx, %ecx ; AVX512F-32-NEXT: vcomisd %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fldl (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: shll $31, %eax +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: shll $31, %ecx ; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -468,30 +461,25 @@ ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovapd %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp) ; AVX512VL-32-NEXT: xorl %ecx, %ecx ; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -905,17 +893,16 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 -; SSE-32-NEXT: jb .LBB4_2 +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: movaps %xmm1, %xmm2 +; SSE-32-NEXT: jae .LBB4_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: xorps %xmm2, %xmm2 ; SSE-32-NEXT: .LBB4_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm2, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait @@ -927,10 +914,10 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: jb .LBB4_4 +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: jae .LBB4_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: xorps %xmm1, %xmm1 ; SSE-32-NEXT: .LBB4_4: ; SSE-32-NEXT: subss %xmm1, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) @@ -1006,17 +993,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB4_2 +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB4_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB4_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -1025,11 +1011,11 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB4_4 +; AVX-32-NEXT: jae .LBB4_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB4_4: -; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovss %xmm0, (%esp) ; AVX-32-NEXT: flds (%esp) ; AVX-32-NEXT: fisttpll (%esp) @@ -1093,30 +1079,25 @@ ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 -; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait ; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ecx, %ecx ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovss %xmm0, (%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds (%esp) ; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: shll $31, %eax +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: shll $31, %ecx ; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1151,30 +1132,25 @@ ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ecx, %ecx ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds (%esp) ; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1225,17 +1201,16 @@ ; SSE-32-NEXT: subl $24, %esp ; SSE-32-NEXT: movl 8(%ebp), %eax ; SSE-32-NEXT: movaps (%eax), %xmm0 -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 -; SSE-32-NEXT: jb .LBB5_2 +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: movaps %xmm1, %xmm2 +; SSE-32-NEXT: jae .LBB5_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: xorps %xmm2, %xmm2 ; SSE-32-NEXT: .LBB5_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm2, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait @@ -1247,10 +1222,10 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: jb .LBB5_4 +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: jae .LBB5_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: xorps %xmm1, %xmm1 ; SSE-32-NEXT: .LBB5_4: ; SSE-32-NEXT: subss %xmm1, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) @@ -1328,17 +1303,16 @@ ; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: movl 8(%ebp), %eax ; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB5_2 +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB5_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB5_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -1347,11 +1321,11 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB5_4 +; AVX-32-NEXT: jae .LBB5_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB5_4: -; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovss %xmm0, (%esp) ; AVX-32-NEXT: flds (%esp) ; AVX-32-NEXT: fisttpll (%esp) @@ -1418,30 +1392,25 @@ ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 -; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait ; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ecx, %ecx ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovss %xmm0, (%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds (%esp) ; AVX512F-32-NEXT: fisttpll (%esp) ; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: shll $31, %eax +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: shll $31, %ecx ; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1477,30 +1446,25 @@ ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ecx, %ecx ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds (%esp) ; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2416,17 +2380,16 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: xorpd %xmm1, %xmm1 -; SSE-32-NEXT: xorpd %xmm3, %xmm3 -; SSE-32-NEXT: jb .LBB19_2 +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: movapd %xmm1, %xmm2 +; SSE-32-NEXT: jae .LBB19_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: xorpd %xmm2, %xmm2 ; SSE-32-NEXT: .LBB19_2: -; SSE-32-NEXT: movapd %xmm0, %xmm4 -; SSE-32-NEXT: subsd %xmm3, %xmm4 -; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: subsd %xmm2, %xmm3 +; SSE-32-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait @@ -2438,10 +2401,10 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: comisd %xmm2, %xmm0 -; SSE-32-NEXT: jb .LBB19_4 +; SSE-32-NEXT: comisd %xmm1, %xmm0 +; SSE-32-NEXT: jae .LBB19_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: xorpd %xmm1, %xmm1 ; SSE-32-NEXT: .LBB19_4: ; SSE-32-NEXT: subsd %xmm1, %xmm0 ; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) @@ -2517,17 +2480,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB19_2 +; AVX-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB19_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB19_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, (%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovsd %xmm2, (%esp) ; AVX-32-NEXT: fldl (%esp) ; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait @@ -2536,11 +2498,11 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB19_4 +; AVX-32-NEXT: jae .LBB19_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB19_4: -; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) @@ -2792,17 +2754,16 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: xorps %xmm3, %xmm3 -; SSE-32-NEXT: jb .LBB21_2 +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: movaps %xmm1, %xmm2 +; SSE-32-NEXT: jae .LBB21_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: xorps %xmm2, %xmm2 ; SSE-32-NEXT: .LBB21_2: -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm3, %xmm4 -; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm2, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: wait @@ -2814,10 +2775,10 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-32-NEXT: comiss %xmm2, %xmm0 -; SSE-32-NEXT: jb .LBB21_4 +; SSE-32-NEXT: comiss %xmm1, %xmm0 +; SSE-32-NEXT: jae .LBB21_4 ; SSE-32-NEXT: # %bb.3: -; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: xorps %xmm1, %xmm1 ; SSE-32-NEXT: .LBB21_4: ; SSE-32-NEXT: subss %xmm1, %xmm0 ; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) @@ -2893,17 +2854,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB21_2 +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB21_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB21_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -2912,11 +2872,11 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB21_4 +; AVX-32-NEXT: jae .LBB21_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB21_4: -; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovss %xmm0, (%esp) ; AVX-32-NEXT: flds (%esp) ; AVX-32-NEXT: fisttpll (%esp) diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -226,17 +226,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB1_2 +; AVX-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB1_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB1_2: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -244,16 +243,16 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX-32-NEXT: vxorpd %xmm5, %xmm5, %xmm5 -; AVX-32-NEXT: jb .LBB1_4 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: jae .LBB1_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: .LBB1_4: -; AVX-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX-32-NEXT: vmovsd %xmm4, (%esp) +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, (%esp) ; AVX-32-NEXT: fldl (%esp) ; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait @@ -261,14 +260,14 @@ ; AVX-32-NEXT: movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB1_6 +; AVX-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX-32-NEXT: vmovapd %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB1_6 ; AVX-32-NEXT: # %bb.5: -; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB1_6: -; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -277,11 +276,11 @@ ; AVX-32-NEXT: shll $31, %edx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB1_8 +; AVX-32-NEXT: jae .LBB1_8 ; AVX-32-NEXT: # %bb.7: -; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB1_8: -; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) @@ -375,83 +374,68 @@ ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512F-32-NEXT: pushl %ebx -; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $32, %esp -; AVX512F-32-NEXT: .cfi_offset %esi, -16 +; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: movl $0, %eax +; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512F-32-NEXT: xorl %eax, %eax +; AVX512F-32-NEXT: vcomisd %xmm3, %xmm2 ; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl %eax, %esi -; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1} -; AVX512F-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512F-32-NEXT: fldl (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: xorl %edx, %edx -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512F-32-NEXT: setb %bl -; AVX512F-32-NEXT: kmovw %ebx, %k1 -; AVX512F-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z} ; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovsd %xmm2, (%esp) +; AVX512F-32-NEXT: xorl %edx, %edx +; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1 ; AVX512F-32-NEXT: setae %dl -; AVX512F-32-NEXT: shll $31, %edx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovw %edx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-32-NEXT: xorl %ecx, %ecx +; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ebx, %ebx -; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512F-32-NEXT: setb %al -; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: vcomisd %xmm3, %xmm0 +; AVX512F-32-NEXT: setae %bl +; AVX512F-32-NEXT: kmovw %ebx, %k1 +; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z} ; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fldl (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) +; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: shll $31, %eax +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: shll $31, %edx +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512F-32-NEXT: setae %bl +; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512F-32-NEXT: shll $31, %ecx +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: shll $31, %ebx ; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-32-NEXT: leal -8(%ebp), %esp -; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: leal -4(%ebp), %esp ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 @@ -483,83 +467,68 @@ ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512VL-32-NEXT: pushl %ebx -; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $32, %esp -; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm2 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setb %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovsd %xmm2, (%esp) +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1 ; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm0 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: shll $31, %ebx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-32-NEXT: leal -8(%ebp), %esp -; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: leal -4(%ebp), %esp ; AVX512VL-32-NEXT: popl %ebx ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 @@ -788,17 +757,16 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB3_2 +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB3_2 ; AVX-32-NEXT: # %bb.1: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB3_2: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -806,15 +774,15 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB3_4 +; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB3_4 ; AVX-32-NEXT: # %bb.3: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB3_4: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, (%esp) ; AVX-32-NEXT: flds (%esp) ; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: wait @@ -822,15 +790,15 @@ ; AVX-32-NEXT: movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-32-NEXT: jb .LBB3_6 +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX-32-NEXT: vmovaps %xmm1, %xmm3 +; AVX-32-NEXT: jae .LBB3_6 ; AVX-32-NEXT: # %bb.5: -; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-32-NEXT: .LBB3_6: -; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX-32-NEXT: wait @@ -839,11 +807,11 @@ ; AVX-32-NEXT: shll $31, %edx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX-32-NEXT: jb .LBB3_8 +; AVX-32-NEXT: jae .LBB3_8 ; AVX-32-NEXT: # %bb.7: -; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-32-NEXT: .LBB3_8: -; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) @@ -937,83 +905,68 @@ ; AVX512F-32-NEXT: movl %esp, %ebp ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512F-32-NEXT: pushl %ebx -; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: andl $-8, %esp -; AVX512F-32-NEXT: subl $32, %esp -; AVX512F-32-NEXT: .cfi_offset %esi, -16 +; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512F-32-NEXT: setb %cl -; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: movl $0, %eax +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: xorl %eax, %eax +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 ; AVX512F-32-NEXT: setae %al -; AVX512F-32-NEXT: shll $31, %eax -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl %eax, %esi -; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512F-32-NEXT: setb %dl -; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, (%esp) -; AVX512F-32-NEXT: flds (%esp) -; AVX512F-32-NEXT: fisttpll (%esp) -; AVX512F-32-NEXT: wait -; AVX512F-32-NEXT: setae %cl -; AVX512F-32-NEXT: shll $31, %ecx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovss %xmm1, (%esp) +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-32-NEXT: xorl %edx, %edx -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512F-32-NEXT: setb %bl -; AVX512F-32-NEXT: kmovw %ebx, %k1 -; AVX512F-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512F-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 ; AVX512F-32-NEXT: setae %dl -; AVX512F-32-NEXT: shll $31, %edx -; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: kmovw %edx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-32-NEXT: xorl %ecx, %ecx +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: setae %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ebx, %ebx -; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512F-32-NEXT: setb %al -; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} +; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512F-32-NEXT: setae %bl +; AVX512F-32-NEXT: kmovw %ebx, %k1 +; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} ; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: flds (%esp) +; AVX512F-32-NEXT: fisttpll (%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: shll $31, %eax +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: shll $31, %edx +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512F-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512F-32-NEXT: setae %bl +; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512F-32-NEXT: shll $31, %ecx +; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: shll $31, %ebx ; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512F-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-32-NEXT: leal -8(%ebp), %esp -; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: leal -4(%ebp), %esp ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 @@ -1045,83 +998,68 @@ ; AVX512VL-32-NEXT: movl %esp, %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512VL-32-NEXT: pushl %ebx -; AVX512VL-32-NEXT: pushl %esi ; AVX512VL-32-NEXT: andl $-8, %esp -; AVX512VL-32-NEXT: subl $32, %esp -; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setb %dl -; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, (%esp) -; AVX512VL-32-NEXT: flds (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %cl -; AVX512VL-32-NEXT: shll $31, %ecx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, (%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setb %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 ; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 -; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: shll $31, %ebx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-32-NEXT: leal -8(%ebp), %esp -; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: leal -4(%ebp), %esp ; AVX512VL-32-NEXT: popl %ebx ; AVX512VL-32-NEXT: popl %ebp ; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -149,147 +149,125 @@ ; AVX512VL-32-NEXT: .cfi_offset %esi, -20 ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp) +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %edi -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) -; AVX512VL-32-NEXT: fldl (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: setae %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %esi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm2 +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -463,147 +441,125 @@ ; AVX512VL-32-NEXT: .cfi_offset %esi, -20 ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %edi -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: movl %eax, %esi -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,3,3,3] -; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 -; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, (%esp) +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds (%esp) ; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %dl -; AVX512VL-32-NEXT: shll $31, %edx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setb %cl -; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: wait -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: shll $31, %eax -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 -; AVX512VL-32-NEXT: setb %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %esi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload -; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX512VL-32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm2 +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0