Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1162,10 +1162,30 @@ // we convert them to normal FP opcodes instead at this point. This // will allow them to be handled by existing target-specific instruction // selectors. - if (!TLI->isStrictFPEnabled() && Node->isStrictFPOpcode() && - (TLI->getOperationAction(Node->getOpcode(), Node->getValueType(0)) - == TargetLowering::Expand)) - Node = CurDAG->mutateStrictFPToFP(Node); + if (!TLI->isStrictFPEnabled() && Node->isStrictFPOpcode()) { + // For some opcodes, we need to call TLI->getOperationAction using + // the first operand type instead of the result type. Note that this + // must match what SelectionDAGLegalize::LegalizeOp is doing. + EVT ActionVT; + switch (Node->getOpcode()) { + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: + ActionVT = Node->getOperand(1).getValueType(); + break; + default: + ActionVT = Node->getValueType(0); + break; + } + if (TLI->getOperationAction(Node->getOpcode(), ActionVT) + == TargetLowering::Expand) + Node = CurDAG->mutateStrictFPToFP(Node); + } LLVM_DEBUG(dbgs() << "\nISEL: Starting selection on root node: "; Node->dump(CurDAG)); Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6074,7 +6074,13 @@ } SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT); - SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT); + SDValue Sel; + + if (Node->isStrictFPOpcode()) + Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, + Node->getOperand(0), /*IsSignaling*/ true); + else + Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT); bool Strict = Node->isStrictFPOpcode() || shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false); @@ -6149,13 +6155,16 @@ // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundidf in compiler_rt. - SDValue Fast; - if (Node->isStrictFPOpcode()) { - Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, - {Node->getOperand(0), Src}); - Chain = SDValue(Fast.getNode(), 1); - } else - Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); + + // TODO: This really should be implemented using a branch rather than a + // select. We happen to get lucky and machinesink does the right + // thing most of the time. This would be a good candidate for a + // pseudo-op, or, even better, for whole-function isel. + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); + + SDValue SignBitTest = DAG.getSetCC( + dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst); @@ -6163,27 +6172,28 @@ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr); - SDValue Slow; + SDValue Slow, Fast; if (Node->isStrictFPOpcode()) { - SDValue SignCvt = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, - {DstVT, MVT::Other}, {Chain, Or}); + // In strict mode, we must avoid spurious exceptions, and therefore + // must make sure to only emit a single STRICT_SINT_TO_FP. + SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Src); + Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DstVT, MVT::Other }, + { Node->getOperand(0), InCvt }); Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DstVT, MVT::Other }, - { SignCvt.getValue(1), SignCvt, SignCvt }); + { Fast.getValue(1), Fast, Fast }); Chain = Slow.getValue(1); + // The STRICT_SINT_TO_FP inherits the exception mode from the + // incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can + // never raise any exception. + SDNodeFlags Flags; + Flags.setFPExcept(Node->getFlags().hasFPExcept()); + Fast->setFlags(Flags); } else { SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); } - // TODO: This really should be implemented using a branch rather than a - // select. We happen to get lucky and machinesink does the right - // thing most of the time. This would be a good candidate for a - // pseudo-op, or, even better, for whole-function isel. - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); - - SDValue SignBitTest = DAG.getSetCC( - dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); Result = DAG.getSelect(dl, DstVT, SignBitTest, Slow, Fast); return true; } Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5246,13 +5246,6 @@ SelectCode(Res.getNode()); return; } - case ISD::STRICT_FP_TO_SINT: - case ISD::STRICT_FP_TO_UINT: - // FIXME: Remove when we have isel patterns for strict versions of these - // nodes. - if (!TLI->isStrictFPEnabled()) - CurDAG->mutateStrictFPToFP(Node); - break; } SelectCode(Node); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19199,10 +19199,15 @@ SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); - SDValue Cmp = DAG.getSetCC(DL, - getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT), - Value, ThreshVal, ISD::SETLT); + EVT ResVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT); + SDValue Cmp; + if (IsStrict) + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, + Chain, /*IsSignaling*/ true); + else + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(APInt::getSignMask(64), Index: llvm/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/lib/Target/X86/X86InstrAVX512.td +++ llvm/lib/Target/X86/X86InstrAVX512.td @@ -7102,22 +7102,22 @@ def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; -def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), +def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), +def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), +def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), +def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (sint_to_fp GR32:$src)), +def : Pat<(f32 (any_sint_to_fp GR32:$src)), (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f32 (sint_to_fp GR64:$src)), +def : Pat<(f32 (any_sint_to_fp GR64:$src)), (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; -def : Pat<(f64 (sint_to_fp GR32:$src)), +def : Pat<(f64 (any_sint_to_fp GR32:$src)), (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f64 (sint_to_fp GR64:$src)), +def : Pat<(f64 (any_sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, @@ -7141,22 +7141,22 @@ def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; -def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), +def : Pat<(f32 (any_uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), +def : Pat<(f32 (any_uint_to_fp (loadi64 addr:$src))), (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), +def : Pat<(f64 (any_uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), +def : Pat<(f64 (any_uint_to_fp (loadi64 addr:$src))), (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (uint_to_fp GR32:$src)), +def : Pat<(f32 (any_uint_to_fp GR32:$src)), (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f32 (uint_to_fp GR64:$src)), +def : Pat<(f32 (any_uint_to_fp GR64:$src)), (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; -def : Pat<(f64 (uint_to_fp GR32:$src)), +def : Pat<(f64 (any_uint_to_fp GR32:$src)), (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f64 (uint_to_fp GR64:$src)), +def : Pat<(f64 (any_uint_to_fp GR64:$src)), (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } @@ -7227,82 +7227,82 @@ let Predicates = [HasAVX512] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR64:$src)))))), (VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi64 addr:$src))))))), (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR32:$src)))))), (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi32 addr:$src))))))), (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR64:$src)))))), (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi64 addr:$src))))))), (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR32:$src)))))), (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi32 addr:$src))))))), (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>; } // Predicates = [HasAVX512] @@ -8108,10 +8108,10 @@ VK4WM:$mask, i64mem:$src), 0, "att">; } -defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86VSintToFP, SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, PS, EVEX_CD8<32, CD8VF>; @@ -8131,11 +8131,11 @@ X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp, X86VUintToFP, SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>; @@ -8187,19 +8187,19 @@ X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; -defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, EVEX_CD8<64, CD8VF>; @@ -8383,17 +8383,17 @@ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; -def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), +def : Pat<(v8f32 (any_uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), +def : Pat<(v4f32 (any_uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), +def : Pat<(v4f64 (any_uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_ymm)>; @@ -8519,32 +8519,32 @@ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))), +def : Pat<(v4f32 (any_sint_to_fp (v4i64 VR256X:$src1))), (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; -def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))), +def : Pat<(v2f64 (any_sint_to_fp (v2i64 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))), +def : Pat<(v4f64 (any_sint_to_fp (v4i64 VR256X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; -def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))), +def : Pat<(v4f32 (any_uint_to_fp (v4i64 VR256X:$src1))), (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; -def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))), +def : Pat<(v2f64 (any_uint_to_fp (v2i64 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)))), sub_xmm)>; -def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), +def : Pat<(v4f64 (any_uint_to_fp (v4i64 VR256X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>; Index: llvm/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/lib/Target/X86/X86InstrSSE.td +++ llvm/lib/Target/X86/X86InstrSSE.td @@ -842,11 +842,11 @@ string asm, Domain d, X86FoldableSchedWrite sched> { let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : I, + [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>, Sched<[sched]>; let mayLoad = 1 in def rm : I, Sched<[sched.Folded]>; } @@ -906,22 +906,22 @@ } // isCodeGenOnly = 1 let Predicates = [UseAVX] in { - def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f32 (sint_to_fp GR32:$src)), + def : Pat<(f32 (any_sint_to_fp GR32:$src)), (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; - def : Pat<(f32 (sint_to_fp GR64:$src)), + def : Pat<(f32 (any_sint_to_fp GR64:$src)), (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>; - def : Pat<(f64 (sint_to_fp GR32:$src)), + def : Pat<(f64 (any_sint_to_fp GR32:$src)), (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; - def : Pat<(f64 (sint_to_fp GR64:$src)), + def : Pat<(f64 (any_sint_to_fp GR64:$src)), (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } @@ -938,16 +938,16 @@ defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; -defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32, "cvtsi2ss", "cvtsi2ss{l}", WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC; -defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, +defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64, "cvtsi2ss", "cvtsi2ss{q}", WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC; -defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32, "cvtsi2sd", "cvtsi2sd{l}", WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD; -defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, +defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64, "cvtsi2sd", "cvtsi2sd{q}", WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC; } // isCodeGenOnly = 1 @@ -1346,42 +1346,42 @@ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseAVX] @@ -1400,44 +1400,44 @@ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseSSE2] let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseSSE1] @@ -1663,13 +1663,13 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>, + (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, VEX_WIG; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, + (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; } Index: llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll =================================================================== --- llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll +++ llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll @@ -19,7 +19,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI0_0 ; CHECK-NEXT: le %f1, 0(%r1) -; CHECK-NEXT: cebr %f0, %f1 +; CHECK-NEXT: kebr %f0, %f1 ; CHECK-NEXT: jnl .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 @@ -43,7 +43,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: ldeb %f1, 0(%r1) -; CHECK-NEXT: cdbr %f0, %f1 +; CHECK-NEXT: kdbr %f0, %f1 ; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 @@ -69,7 +69,7 @@ ; CHECK-NEXT: ld %f2, 8(%r2) ; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) -; CHECK-NEXT: cxbr %f0, %f1 +; CHECK-NEXT: kxbr %f0, %f1 ; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lhi %r0, 0 Index: llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll =================================================================== --- llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll +++ llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll @@ -18,7 +18,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI0_0 ; CHECK-NEXT: le %f1, 0(%r1) -; CHECK-NEXT: cebr %f0, %f1 +; CHECK-NEXT: kebr %f0, %f1 ; CHECK-NEXT: jnl .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 @@ -42,7 +42,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI1_0 ; CHECK-NEXT: ldeb %f1, 0(%r1) -; CHECK-NEXT: cdbr %f0, %f1 +; CHECK-NEXT: kdbr %f0, %f1 ; CHECK-NEXT: jnl .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 @@ -68,7 +68,7 @@ ; CHECK-NEXT: ld %f2, 8(%r2) ; CHECK-NEXT: larl %r1, .LCPI2_0 ; CHECK-NEXT: lxeb %f1, 0(%r1) -; CHECK-NEXT: cxbr %f0, %f1 +; CHECK-NEXT: kxbr %f0, %f1 ; CHECK-NEXT: jnl .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lghi %r0, 0 Index: llvm/test/CodeGen/X86/fp-intrinsics-flags.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics-flags.ll +++ llvm/test/CodeGen/X86/fp-intrinsics-flags.ll @@ -29,13 +29,13 @@ ; CHECK-LABEL: name: f20u64 ; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.0, align 16) ; CHECK: [[MOVSDrm_alt1:%[0-9]+]]:fr64 = MOVSDrm_alt $noreg, 1, $noreg, %const.0, $noreg :: (load 8 from constant-pool) -; CHECK: [[CMPSDrr:%[0-9]+]]:fr64 = CMPSDrr [[MOVSDrm_alt]], [[MOVSDrm_alt1]], 1, implicit $mxcsr -; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY [[CMPSDrr]] -; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY [[MOVSDrm_alt1]] -; CHECK: [[PANDNrr:%[0-9]+]]:vr128 = PANDNrr [[COPY]], killed [[COPY1]] -; CHECK: [[COPY2:%[0-9]+]]:fr64 = COPY [[PANDNrr]] -; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[COPY2]], implicit $mxcsr +; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr +; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD +; CHECK: JCC_1 +; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}} +; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr ; CHECK: MOVSDmr %stack.0, 1, $noreg, 0, $noreg, killed [[SUBSDrr]] :: (store 8 into %stack.0) +; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags ; CHECK: [[LD_Fp64m:%[0-9]+]]:rfp64 = LD_Fp64m %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load 8 from %stack.0) ; CHECK: FNSTCW16m %stack.1, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit $fpcw :: (store 2 into %stack.1) ; CHECK: [[MOVZX32rm16_:%[0-9]+]]:gr32 = MOVZX32rm16 %stack.1, 1, $noreg, 0, $noreg :: (load 2 from %stack.1) @@ -45,8 +45,6 @@ ; CHECK: FLDCW16m %stack.2, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit-def $fpcw :: (load 2 from %stack.2) ; CHECK: IST_Fp64m64 %stack.0, 1, $noreg, 0, $noreg, [[LD_Fp64m]], implicit-def $fpsw, implicit $fpcw ; CHECK: FLDCW16m %stack.1, 1, $noreg, 0, $noreg, implicit-def $fpsw, implicit-def $fpcw :: (load 2 from %stack.1) -; CHECK: UCOMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr -; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags ; CHECK: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 killed [[SETCCr]] ; CHECK: [[SHL32ri:%[0-9]+]]:gr32 = SHL32ri [[MOVZX32rr8_]], 31, implicit-def dead $eflags ; CHECK: [[XOR32rm:%[0-9]+]]:gr32 = XOR32rm [[SHL32ri]], %stack.0, 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load 4 from %stack.0 + 4) @@ -86,16 +84,14 @@ ; CHECK-LABEL: name: f20u ; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.0, align 16) ; CHECK: [[MOVSDrm_alt1:%[0-9]+]]:fr64 = MOVSDrm_alt $noreg, 1, $noreg, %const.0, $noreg :: (load 8 from constant-pool) -; CHECK: UCOMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr +; CHECK: COMISDrr [[MOVSDrm_alt1]], [[MOVSDrm_alt]], implicit-def $eflags, implicit $mxcsr +; CHECK: [[FsFLD0SD:%[0-9]+]]:fr64 = FsFLD0SD +; CHECK: JCC_1 +; CHECK: [[PHI:%[0-9]+]]:fr64 = PHI [[MOVSDrm_alt1]], {{.*}}, [[FsFLD0SD]], {{.*}} ; CHECK: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 6, implicit $eflags ; CHECK: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 killed [[SETCCr]] ; CHECK: [[SHL32ri:%[0-9]+]]:gr32 = SHL32ri [[MOVZX32rr8_]], 31, implicit-def dead $eflags -; CHECK: [[CMPSDrr:%[0-9]+]]:fr64 = CMPSDrr [[MOVSDrm_alt]], [[MOVSDrm_alt1]], 1, implicit $mxcsr -; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY [[CMPSDrr]] -; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY [[MOVSDrm_alt1]] -; CHECK: [[PANDNrr:%[0-9]+]]:vr128 = PANDNrr [[COPY]], killed [[COPY1]] -; CHECK: [[COPY2:%[0-9]+]]:fr64 = COPY [[PANDNrr]] -; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[COPY2]], implicit $mxcsr +; CHECK: [[SUBSDrr:%[0-9]+]]:fr64 = SUBSDrr [[MOVSDrm_alt]], killed [[PHI]], implicit $mxcsr ; CHECK: [[CVTTSD2SIrr:%[0-9]+]]:gr32 = CVTTSD2SIrr killed [[SUBSDrr]], implicit $mxcsr ; CHECK: [[XOR32rr:%[0-9]+]]:gr32 = XOR32rr [[CVTTSD2SIrr]], killed [[SHL32ri]], implicit-def dead $eflags ; CHECK: $eax = COPY [[XOR32rr]] Index: llvm/test/CodeGen/X86/fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics.ll +++ llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1277,15 +1277,17 @@ ; X86-SSE-LABEL: f20u: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE-NEXT: xorl %ecx, %ecx -; X86-SSE-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE-NEXT: setbe %cl +; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE-NEXT: comisd %xmm0, %xmm2 +; X86-SSE-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE-NEXT: ja .LBB24_2 +; X86-SSE-NEXT: # %bb.1: # %entry +; X86-SSE-NEXT: movapd %xmm2, %xmm1 +; X86-SSE-NEXT: .LBB24_2: # %entry +; X86-SSE-NEXT: setbe %al +; X86-SSE-NEXT: movzbl %al, %ecx ; X86-SSE-NEXT: shll $31, %ecx -; X86-SSE-NEXT: movapd %xmm0, %xmm2 -; X86-SSE-NEXT: cmpltsd %xmm1, %xmm2 -; X86-SSE-NEXT: andnpd %xmm1, %xmm2 -; X86-SSE-NEXT: subsd %xmm2, %xmm0 +; X86-SSE-NEXT: subsd %xmm1, %xmm0 ; X86-SSE-NEXT: cvttsd2si %xmm0, %eax ; X86-SSE-NEXT: xorl %ecx, %eax ; X86-SSE-NEXT: retl @@ -1324,7 +1326,7 @@ ; X87-NEXT: fldl {{[0-9]+}}(%esp) ; X87-NEXT: flds {{\.LCPI.*}} ; X87-NEXT: xorl %edx, %edx -; X87-NEXT: fucomi %st(1), %st +; X87-NEXT: fcomi %st(1), %st ; X87-NEXT: setbe %dl ; X87-NEXT: fldz ; X87-NEXT: fxch %st(1) @@ -1350,24 +1352,25 @@ ; X86-SSE-NEXT: subl $20, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 24 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE-NEXT: movapd %xmm0, %xmm2 -; X86-SSE-NEXT: cmpltsd %xmm1, %xmm2 -; X86-SSE-NEXT: andnpd %xmm1, %xmm2 -; X86-SSE-NEXT: movapd %xmm0, %xmm3 -; X86-SSE-NEXT: subsd %xmm2, %xmm3 -; X86-SSE-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE-NEXT: comisd %xmm0, %xmm2 +; X86-SSE-NEXT: xorpd %xmm1, %xmm1 +; X86-SSE-NEXT: ja .LBB25_2 +; X86-SSE-NEXT: # %bb.1: # %entry +; X86-SSE-NEXT: movapd %xmm2, %xmm1 +; X86-SSE-NEXT: .LBB25_2: # %entry +; X86-SSE-NEXT: subsd %xmm1, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: setbe %al ; X86-SSE-NEXT: fldl {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fnstcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; X86-SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-SSE-NEXT: fldcw {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE-NEXT: setbe %dl +; X86-SSE-NEXT: movzbl %al, %edx ; X86-SSE-NEXT: shll $31, %edx ; X86-SSE-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1377,30 +1380,35 @@ ; ; SSE-LABEL: f20u64: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: ucomisd %xmm1, %xmm0 -; SSE-NEXT: setae %cl -; SSE-NEXT: shlq $63, %rcx -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: cmpltsd %xmm1, %xmm2 -; SSE-NEXT: andnpd %xmm1, %xmm2 -; SSE-NEXT: subsd %xmm2, %xmm0 -; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: comisd %xmm2, %xmm0 +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: jb .LBB25_2 +; SSE-NEXT: # %bb.1: # %entry +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: .LBB25_2: # %entry +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: setae %al +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: shlq $63, %rax ; SSE-NEXT: xorq %rcx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: f20u64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-NEXT: setae %cl -; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vandnpd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vcomisd %xmm1, %xmm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jb .LBB25_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-NEXT: .LBB25_2: # %entry +; AVX1-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: setae %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax ; AVX1-NEXT: xorq %rcx, %rax ; AVX1-NEXT: retq ; @@ -2656,34 +2664,34 @@ ; ; SSE-LABEL: uiffl: ; SSE: # %bb.0: # %entry -; SSE-NEXT: testq %rdi, %rdi -; SSE-NEXT: js .LBB52_1 -; SSE-NEXT: # %bb.2: # %entry -; SSE-NEXT: cvtsi2ss %rdi, %xmm0 -; SSE-NEXT: retq -; SSE-NEXT: .LBB52_1: ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: shrq %rax -; SSE-NEXT: andl $1, %edi -; SSE-NEXT: orq %rax, %rdi -; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: movl %edi, %ecx +; SSE-NEXT: andl $1, %ecx +; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnsq %rdi, %rcx +; SSE-NEXT: cvtsi2ss %rcx, %xmm0 +; SSE-NEXT: jns .LBB52_2 +; SSE-NEXT: # %bb.1: ; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: .LBB52_2: # %entry ; SSE-NEXT: retq ; ; AVX1-LABEL: uiffl: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: testq %rdi, %rdi -; AVX1-NEXT: js .LBB52_1 -; AVX1-NEXT: # %bb.2: # %entry -; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB52_1: ; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: andl $1, %edi -; AVX1-NEXT: orq %rax, %rdi -; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: cmovnsq %rdi, %rcx +; AVX1-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; AVX1-NEXT: jns .LBB52_2 +; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB52_2: # %entry ; AVX1-NEXT: retq ; ; AVX512-LABEL: uiffl: Index: llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll =================================================================== --- llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll +++ llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -437,15 +437,17 @@ ; SSE-X86-LABEL: fptoui_f32toi32: ; SSE-X86: # %bb.0: ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-X86-NEXT: xorl %ecx, %ecx -; SSE-X86-NEXT: ucomiss %xmm0, %xmm1 -; SSE-X86-NEXT: setbe %cl +; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-X86-NEXT: comiss %xmm0, %xmm2 +; SSE-X86-NEXT: xorps %xmm1, %xmm1 +; SSE-X86-NEXT: ja .LBB8_2 +; SSE-X86-NEXT: # %bb.1: +; SSE-X86-NEXT: movaps %xmm2, %xmm1 +; SSE-X86-NEXT: .LBB8_2: +; SSE-X86-NEXT: setbe %al +; SSE-X86-NEXT: movzbl %al, %ecx ; SSE-X86-NEXT: shll $31, %ecx -; SSE-X86-NEXT: movaps %xmm0, %xmm2 -; SSE-X86-NEXT: cmpltss %xmm1, %xmm2 -; SSE-X86-NEXT: andnps %xmm1, %xmm2 -; SSE-X86-NEXT: subss %xmm2, %xmm0 +; SSE-X86-NEXT: subss %xmm1, %xmm0 ; SSE-X86-NEXT: cvttss2si %xmm0, %eax ; SSE-X86-NEXT: xorl %ecx, %eax ; SSE-X86-NEXT: retl @@ -529,24 +531,25 @@ ; SSE-X86-NEXT: andl $-8, %esp ; SSE-X86-NEXT: subl $16, %esp ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-X86-NEXT: movaps %xmm0, %xmm2 -; SSE-X86-NEXT: cmpltss %xmm1, %xmm2 -; SSE-X86-NEXT: andnps %xmm1, %xmm2 -; SSE-X86-NEXT: movaps %xmm0, %xmm3 -; SSE-X86-NEXT: subss %xmm2, %xmm3 -; SSE-X86-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-X86-NEXT: comiss %xmm0, %xmm2 +; SSE-X86-NEXT: xorps %xmm1, %xmm1 +; SSE-X86-NEXT: ja .LBB9_2 +; SSE-X86-NEXT: # %bb.1: +; SSE-X86-NEXT: movaps %xmm2, %xmm1 +; SSE-X86-NEXT: .LBB9_2: +; SSE-X86-NEXT: subss %xmm1, %xmm0 +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: setbe %al ; SSE-X86-NEXT: flds {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-X86-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-X86-NEXT: movw %cx, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: xorl %edx, %edx -; SSE-X86-NEXT: ucomiss %xmm0, %xmm1 -; SSE-X86-NEXT: setbe %dl +; SSE-X86-NEXT: movzbl %al, %edx ; SSE-X86-NEXT: shll $31, %edx ; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -557,16 +560,18 @@ ; ; SSE-X64-LABEL: fptoui_f32toi64: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-X64-NEXT: xorl %ecx, %ecx -; SSE-X64-NEXT: ucomiss %xmm1, %xmm0 -; SSE-X64-NEXT: setae %cl -; SSE-X64-NEXT: shlq $63, %rcx -; SSE-X64-NEXT: movaps %xmm0, %xmm2 -; SSE-X64-NEXT: cmpltss %xmm1, %xmm2 -; SSE-X64-NEXT: andnps %xmm1, %xmm2 -; SSE-X64-NEXT: subss %xmm2, %xmm0 -; SSE-X64-NEXT: cvttss2si %xmm0, %rax +; SSE-X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-X64-NEXT: comiss %xmm2, %xmm0 +; SSE-X64-NEXT: xorps %xmm1, %xmm1 +; SSE-X64-NEXT: jb .LBB9_2 +; SSE-X64-NEXT: # %bb.1: +; SSE-X64-NEXT: movaps %xmm2, %xmm1 +; SSE-X64-NEXT: .LBB9_2: +; SSE-X64-NEXT: subss %xmm1, %xmm0 +; SSE-X64-NEXT: cvttss2si %xmm0, %rcx +; SSE-X64-NEXT: setae %al +; SSE-X64-NEXT: movzbl %al, %eax +; SSE-X64-NEXT: shlq $63, %rax ; SSE-X64-NEXT: xorq %rcx, %rax ; SSE-X64-NEXT: retq ; @@ -581,15 +586,18 @@ ; AVX1-X86-NEXT: subl $8, %esp ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-X86-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 -; AVX1-X86-NEXT: vandnps %xmm1, %xmm2, %xmm2 -; AVX1-X86-NEXT: vsubss %xmm2, %xmm0, %xmm2 -; AVX1-X86-NEXT: vmovss %xmm2, (%esp) +; AVX1-X86-NEXT: vcomiss %xmm0, %xmm1 +; AVX1-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-X86-NEXT: ja .LBB9_2 +; AVX1-X86-NEXT: # %bb.1: +; AVX1-X86-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-X86-NEXT: .LBB9_2: +; AVX1-X86-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX1-X86-NEXT: vmovss %xmm0, (%esp) ; AVX1-X86-NEXT: flds (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) -; AVX1-X86-NEXT: xorl %edx, %edx -; AVX1-X86-NEXT: vucomiss %xmm0, %xmm1 -; AVX1-X86-NEXT: setbe %dl +; AVX1-X86-NEXT: setbe %al +; AVX1-X86-NEXT: movzbl %al, %edx ; AVX1-X86-NEXT: shll $31, %edx ; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX1-X86-NEXT: movl (%esp), %eax @@ -601,14 +609,17 @@ ; AVX1-X64-LABEL: fptoui_f32toi64: ; AVX1-X64: # %bb.0: ; AVX1-X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-X64-NEXT: xorl %ecx, %ecx -; AVX1-X64-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-X64-NEXT: setae %cl -; AVX1-X64-NEXT: shlq $63, %rcx -; AVX1-X64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 -; AVX1-X64-NEXT: vandnps %xmm1, %xmm2, %xmm1 -; AVX1-X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX1-X64-NEXT: vcvttss2si %xmm0, %rax +; AVX1-X64-NEXT: vcomiss %xmm1, %xmm0 +; AVX1-X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-X64-NEXT: jb .LBB9_2 +; AVX1-X64-NEXT: # %bb.1: +; AVX1-X64-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-X64-NEXT: .LBB9_2: +; AVX1-X64-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-X64-NEXT: setae %al +; AVX1-X64-NEXT: movzbl %al, %eax +; AVX1-X64-NEXT: shlq $63, %rax ; AVX1-X64-NEXT: xorq %rcx, %rax ; AVX1-X64-NEXT: retq ; @@ -623,10 +634,11 @@ ; AVX512-X86-NEXT: subl $8, %esp ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512-X86-NEXT: vcmpltss %xmm1, %xmm0, %k1 -; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-X86-NEXT: xorl %edx, %edx -; AVX512-X86-NEXT: vucomiss %xmm0, %xmm1 +; AVX512-X86-NEXT: vcomiss %xmm0, %xmm1 +; AVX512-X86-NEXT: seta %al +; AVX512-X86-NEXT: kmovw %eax, %k1 +; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-X86-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-X86-NEXT: vmovss %xmm0, (%esp) @@ -657,7 +669,7 @@ ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: flds 8(%ebp) ; CHECK-NEXT: flds {{\.LCPI.*}} -; CHECK-NEXT: fucom %st(1) +; CHECK-NEXT: fcom %st(1) ; CHECK-NEXT: fnstsw %ax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: # kill: def $ah killed $ah killed $ax @@ -1054,15 +1066,17 @@ ; SSE-X86-LABEL: fptoui_f64toi32: ; SSE-X86: # %bb.0: ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-X86-NEXT: xorl %ecx, %ecx -; SSE-X86-NEXT: ucomisd %xmm0, %xmm1 -; SSE-X86-NEXT: setbe %cl +; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-X86-NEXT: comisd %xmm0, %xmm2 +; SSE-X86-NEXT: xorpd %xmm1, %xmm1 +; SSE-X86-NEXT: ja .LBB17_2 +; SSE-X86-NEXT: # %bb.1: +; SSE-X86-NEXT: movapd %xmm2, %xmm1 +; SSE-X86-NEXT: .LBB17_2: +; SSE-X86-NEXT: setbe %al +; SSE-X86-NEXT: movzbl %al, %ecx ; SSE-X86-NEXT: shll $31, %ecx -; SSE-X86-NEXT: movapd %xmm0, %xmm2 -; SSE-X86-NEXT: cmpltsd %xmm1, %xmm2 -; SSE-X86-NEXT: andnpd %xmm1, %xmm2 -; SSE-X86-NEXT: subsd %xmm2, %xmm0 +; SSE-X86-NEXT: subsd %xmm1, %xmm0 ; SSE-X86-NEXT: cvttsd2si %xmm0, %eax ; SSE-X86-NEXT: xorl %ecx, %eax ; SSE-X86-NEXT: retl @@ -1146,24 +1160,25 @@ ; SSE-X86-NEXT: andl $-8, %esp ; SSE-X86-NEXT: subl $16, %esp ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-X86-NEXT: movapd %xmm0, %xmm2 -; SSE-X86-NEXT: cmpltsd %xmm1, %xmm2 -; SSE-X86-NEXT: andnpd %xmm1, %xmm2 -; SSE-X86-NEXT: movapd %xmm0, %xmm3 -; SSE-X86-NEXT: subsd %xmm2, %xmm3 -; SSE-X86-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-X86-NEXT: comisd %xmm0, %xmm2 +; SSE-X86-NEXT: xorpd %xmm1, %xmm1 +; SSE-X86-NEXT: ja .LBB18_2 +; SSE-X86-NEXT: # %bb.1: +; SSE-X86-NEXT: movapd %xmm2, %xmm1 +; SSE-X86-NEXT: .LBB18_2: +; SSE-X86-NEXT: subsd %xmm1, %xmm0 +; SSE-X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: setbe %al ; SSE-X86-NEXT: fldl {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-X86-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-X86-NEXT: movw %cx, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) -; SSE-X86-NEXT: xorl %edx, %edx -; SSE-X86-NEXT: ucomisd %xmm0, %xmm1 -; SSE-X86-NEXT: setbe %dl +; SSE-X86-NEXT: movzbl %al, %edx ; SSE-X86-NEXT: shll $31, %edx ; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1174,16 +1189,18 @@ ; ; SSE-X64-LABEL: fptoui_f64toi64: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-X64-NEXT: xorl %ecx, %ecx -; SSE-X64-NEXT: ucomisd %xmm1, %xmm0 -; SSE-X64-NEXT: setae %cl -; SSE-X64-NEXT: shlq $63, %rcx -; SSE-X64-NEXT: movapd %xmm0, %xmm2 -; SSE-X64-NEXT: cmpltsd %xmm1, %xmm2 -; SSE-X64-NEXT: andnpd %xmm1, %xmm2 -; SSE-X64-NEXT: subsd %xmm2, %xmm0 -; SSE-X64-NEXT: cvttsd2si %xmm0, %rax +; SSE-X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-X64-NEXT: comisd %xmm2, %xmm0 +; SSE-X64-NEXT: xorpd %xmm1, %xmm1 +; SSE-X64-NEXT: jb .LBB18_2 +; SSE-X64-NEXT: # %bb.1: +; SSE-X64-NEXT: movapd %xmm2, %xmm1 +; SSE-X64-NEXT: .LBB18_2: +; SSE-X64-NEXT: subsd %xmm1, %xmm0 +; SSE-X64-NEXT: cvttsd2si %xmm0, %rcx +; SSE-X64-NEXT: setae %al +; SSE-X64-NEXT: movzbl %al, %eax +; SSE-X64-NEXT: shlq $63, %rax ; SSE-X64-NEXT: xorq %rcx, %rax ; SSE-X64-NEXT: retq ; @@ -1198,15 +1215,18 @@ ; AVX1-X86-NEXT: subl $8, %esp ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-X86-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 -; AVX1-X86-NEXT: vandnpd %xmm1, %xmm2, %xmm2 -; AVX1-X86-NEXT: vsubsd %xmm2, %xmm0, %xmm2 -; AVX1-X86-NEXT: vmovsd %xmm2, (%esp) +; AVX1-X86-NEXT: vcomisd %xmm0, %xmm1 +; AVX1-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-X86-NEXT: ja .LBB18_2 +; AVX1-X86-NEXT: # %bb.1: +; AVX1-X86-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-X86-NEXT: .LBB18_2: +; AVX1-X86-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX1-X86-NEXT: fldl (%esp) ; AVX1-X86-NEXT: fisttpll (%esp) -; AVX1-X86-NEXT: xorl %edx, %edx -; AVX1-X86-NEXT: vucomisd %xmm0, %xmm1 -; AVX1-X86-NEXT: setbe %dl +; AVX1-X86-NEXT: setbe %al +; AVX1-X86-NEXT: movzbl %al, %edx ; AVX1-X86-NEXT: shll $31, %edx ; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX1-X86-NEXT: movl (%esp), %eax @@ -1218,14 +1238,17 @@ ; AVX1-X64-LABEL: fptoui_f64toi64: ; AVX1-X64: # %bb.0: ; AVX1-X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-X64-NEXT: xorl %ecx, %ecx -; AVX1-X64-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-X64-NEXT: setae %cl -; AVX1-X64-NEXT: shlq $63, %rcx -; AVX1-X64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 -; AVX1-X64-NEXT: vandnpd %xmm1, %xmm2, %xmm1 -; AVX1-X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX1-X64-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-X64-NEXT: vcomisd %xmm1, %xmm0 +; AVX1-X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-X64-NEXT: jb .LBB18_2 +; AVX1-X64-NEXT: # %bb.1: +; AVX1-X64-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-X64-NEXT: .LBB18_2: +; AVX1-X64-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-X64-NEXT: setae %al +; AVX1-X64-NEXT: movzbl %al, %eax +; AVX1-X64-NEXT: shlq $63, %rax ; AVX1-X64-NEXT: xorq %rcx, %rax ; AVX1-X64-NEXT: retq ; @@ -1240,10 +1263,11 @@ ; AVX512-X86-NEXT: subl $8, %esp ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-X86-NEXT: vcmpltsd %xmm1, %xmm0, %k1 -; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-X86-NEXT: xorl %edx, %edx -; AVX512-X86-NEXT: vucomisd %xmm0, %xmm1 +; AVX512-X86-NEXT: vcomisd %xmm0, %xmm1 +; AVX512-X86-NEXT: seta %al +; AVX512-X86-NEXT: kmovw %eax, %k1 +; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512-X86-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512-X86-NEXT: vmovsd %xmm0, (%esp) @@ -1274,7 +1298,7 @@ ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: fldl 8(%ebp) ; CHECK-NEXT: flds {{\.LCPI.*}} -; CHECK-NEXT: fucom %st(1) +; CHECK-NEXT: fcom %st(1) ; CHECK-NEXT: fnstsw %ax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: # kill: def $ah killed $ah killed $ax Index: llvm/test/CodeGen/X86/fp80-strict-scalar.ll =================================================================== --- llvm/test/CodeGen/X86/fp80-strict-scalar.ll +++ llvm/test/CodeGen/X86/fp80-strict-scalar.ll @@ -543,7 +543,7 @@ ; X86-NEXT: subl $16, %esp ; X86-NEXT: fldt 8(%ebp) ; X86-NEXT: flds {{\.LCPI.*}} -; X86-NEXT: fucom %st(1) +; X86-NEXT: fcom %st(1) ; X86-NEXT: fnstsw %ax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: # kill: def $ah killed $ah killed $ax @@ -579,7 +579,7 @@ ; X64-NEXT: fldt {{[0-9]+}}(%rsp) ; X64-NEXT: flds {{.*}}(%rip) ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: fucomi %st(1), %st +; X64-NEXT: fcomi %st(1), %st ; X64-NEXT: setbe %al ; X64-NEXT: fldz ; X64-NEXT: fxch %st(1) Index: llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -176,7 +176,7 @@ ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp ; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: comisd %xmm2, %xmm0 ; SSE-32-NEXT: xorpd %xmm1, %xmm1 ; SSE-32-NEXT: xorpd %xmm3, %xmm3 ; SSE-32-NEXT: jb .LBB1_2 @@ -196,7 +196,7 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: comisd %xmm2, %xmm0 ; SSE-32-NEXT: jb .LBB1_4 ; SSE-32-NEXT: # %bb.3: ; SSE-32-NEXT: movapd %xmm2, %xmm1 @@ -232,29 +232,33 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movapd %xmm0, %xmm1 -; SSE-64-NEXT: cmpltsd %xmm2, %xmm1 -; SSE-64-NEXT: andnpd %xmm2, %xmm1 -; SSE-64-NEXT: movapd %xmm0, %xmm3 -; SSE-64-NEXT: subsd %xmm1, %xmm3 -; SSE-64-NEXT: cvttsd2si %xmm3, %rcx +; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: xorpd %xmm2, %xmm2 +; SSE-64-NEXT: xorpd %xmm1, %xmm1 +; SSE-64-NEXT: jb .LBB1_2 +; SSE-64-NEXT: # %bb.1: +; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: .LBB1_2: +; SSE-64-NEXT: movapd %xmm0, %xmm4 +; SSE-64-NEXT: subsd %xmm1, %xmm4 +; SSE-64-NEXT: cvttsd2si %xmm4, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movapd %xmm0, %xmm3 -; SSE-64-NEXT: cmpltsd %xmm2, %xmm3 -; SSE-64-NEXT: andnpd %xmm2, %xmm3 -; SSE-64-NEXT: subsd %xmm3, %xmm0 -; SSE-64-NEXT: cvttsd2si %xmm0, %rcx +; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: jb .LBB1_4 +; SSE-64-NEXT: # %bb.3: +; SSE-64-NEXT: movapd %xmm3, %xmm2 +; SSE-64-NEXT: .LBB1_4: +; SSE-64-NEXT: subsd %xmm2, %xmm0 +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm0 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -272,7 +276,7 @@ ; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB1_2 @@ -287,7 +291,7 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX-32-NEXT: jb .LBB1_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 @@ -312,28 +316,34 @@ ; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 -; AVX-64-NEXT: vandnpd %xmm1, %xmm2, %xmm2 -; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm2 -; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX-64-NEXT: jb .LBB1_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovapd %xmm1, %xmm3 +; AVX-64-NEXT: .LBB1_2: +; AVX-64-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttsd2si %xmm3, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovq %rcx, %xmm3 ; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm1 -; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB1_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovapd %xmm1, %xmm2 +; AVX-64-NEXT: .LBB1_4: +; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: @@ -348,7 +358,7 @@ ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vucomisd %xmm2, %xmm1 +; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1 ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -362,7 +372,7 @@ ; AVX512VL-32-NEXT: shll $31, %eax ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomisd %xmm2, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm0 ; AVX512VL-32-NEXT: setb %dl ; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} @@ -559,48 +569,50 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-32-NEXT: movaps %xmm0, %xmm2 -; SSE-32-NEXT: cmpltss %xmm1, %xmm2 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: subss %xmm2, %xmm3 -; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) -; SSE-32-NEXT: movaps %xmm0, %xmm2 -; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: xorps %xmm1, %xmm1 +; SSE-32-NEXT: xorps %xmm3, %xmm3 +; SSE-32-NEXT: jb .LBB3_2 +; SSE-32-NEXT: # %bb.1: ; SSE-32-NEXT: movaps %xmm2, %xmm3 -; SSE-32-NEXT: cmpltss %xmm1, %xmm3 -; SSE-32-NEXT: andnps %xmm1, %xmm3 -; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: .LBB3_2: +; SSE-32-NEXT: movaps %xmm0, %xmm4 ; SSE-32-NEXT: subss %xmm3, %xmm4 ; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %al ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: jb .LBB3_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: .LBB3_4: +; SSE-32-NEXT: subss %xmm1, %xmm0 +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: fnstcw (%esp) -; SSE-32-NEXT: movzwl (%esp), %eax -; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl (%esp), %edx +; SSE-32-NEXT: orl $3072, %edx # imm = 0xC00 +; SSE-32-NEXT: movw %dx, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm1, %xmm0 -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax ; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm1, %xmm2 -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-32-NEXT: movzbl %cl, %eax ; SSE-32-NEXT: shll $31, %eax ; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 @@ -614,29 +626,33 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomiss %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movaps %xmm0, %xmm1 -; SSE-64-NEXT: cmpltss %xmm2, %xmm1 -; SSE-64-NEXT: andnps %xmm2, %xmm1 -; SSE-64-NEXT: movaps %xmm0, %xmm3 -; SSE-64-NEXT: subss %xmm1, %xmm3 -; SSE-64-NEXT: cvttss2si %xmm3, %rcx +; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: xorps %xmm2, %xmm2 +; SSE-64-NEXT: xorps %xmm1, %xmm1 +; SSE-64-NEXT: jb .LBB3_2 +; SSE-64-NEXT: # %bb.1: +; SSE-64-NEXT: movaps %xmm3, %xmm1 +; SSE-64-NEXT: .LBB3_2: +; SSE-64-NEXT: movaps %xmm0, %xmm4 +; SSE-64-NEXT: subss %xmm1, %xmm4 +; SSE-64-NEXT: cvttss2si %xmm4, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomiss %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movaps %xmm0, %xmm3 -; SSE-64-NEXT: cmpltss %xmm2, %xmm3 -; SSE-64-NEXT: andnps %xmm2, %xmm3 -; SSE-64-NEXT: subss %xmm3, %xmm0 -; SSE-64-NEXT: cvttss2si %xmm0, %rcx +; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: jb .LBB3_4 +; SSE-64-NEXT: # %bb.3: +; SSE-64-NEXT: movaps %xmm3, %xmm2 +; SSE-64-NEXT: .LBB3_4: +; SSE-64-NEXT: subss %xmm2, %xmm0 +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm0 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -652,28 +668,34 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcmpltss %xmm2, %xmm1, %xmm3 -; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 -; AVX-32-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB3_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: .LBB3_2: +; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vcmpltss %xmm2, %xmm0, %xmm3 -; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 -; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, (%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) -; AVX-32-NEXT: xorl %eax, %eax -; AVX-32-NEXT: vucomiss %xmm2, %xmm1 ; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: xorl %ecx, %ecx -; AVX-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB3_4 +; AVX-32-NEXT: # %bb.3: +; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: .LBB3_4: +; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -688,28 +710,34 @@ ; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 -; AVX-64-NEXT: vandnps %xmm1, %xmm2, %xmm2 -; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm2 -; AVX-64-NEXT: vcvttss2si %xmm2, %rcx +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-64-NEXT: jb .LBB3_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovaps %xmm1, %xmm3 +; AVX-64-NEXT: .LBB3_2: +; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttss2si %xmm3, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovq %rcx, %xmm3 ; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm1 -; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB3_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovaps %xmm1, %xmm2 +; AVX-64-NEXT: .LBB3_4: +; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64: @@ -723,28 +751,29 @@ ; AVX512VL-32-NEXT: subl $16, %esp ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vcmpltss %xmm2, %xmm1, %k1 +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 ; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vcmpltss %xmm2, %xmm0, %k1 -; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 -; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} -; AVX512VL-32-NEXT: vsubss %xmm4, %xmm0, %xmm3 -; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: flds (%esp) -; AVX512VL-32-NEXT: fisttpll (%esp) -; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vucomiss %xmm2, %xmm1 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: shll $31, %eax ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, (%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) ; AVX512VL-32-NEXT: setae %cl ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx @@ -828,28 +857,32 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 { ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; SSE-32: # %bb.0: -; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-32-NEXT: comisd %xmm3, %xmm0 +; SSE-32-NEXT: xorpd %xmm2, %xmm2 +; SSE-32-NEXT: xorpd %xmm1, %xmm1 +; SSE-32-NEXT: jb .LBB5_2 +; SSE-32-NEXT: # %bb.1: +; SSE-32-NEXT: movapd %xmm3, %xmm1 +; SSE-32-NEXT: .LBB5_2: ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movapd %xmm0, %xmm1 -; SSE-32-NEXT: cmpltsd %xmm2, %xmm1 -; SSE-32-NEXT: andnpd %xmm2, %xmm1 -; SSE-32-NEXT: movapd %xmm0, %xmm3 -; SSE-32-NEXT: subsd %xmm1, %xmm3 -; SSE-32-NEXT: cvttsd2si %xmm3, %ecx +; SSE-32-NEXT: movapd %xmm0, %xmm4 +; SSE-32-NEXT: subsd %xmm1, %xmm4 +; SSE-32-NEXT: cvttsd2si %xmm4, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm1 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: comisd %xmm3, %xmm0 +; SSE-32-NEXT: jb .LBB5_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movapd %xmm3, %xmm2 +; SSE-32-NEXT: .LBB5_4: ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movapd %xmm0, %xmm3 -; SSE-32-NEXT: cmpltsd %xmm2, %xmm3 -; SSE-32-NEXT: andnpd %xmm2, %xmm3 -; SSE-32-NEXT: subsd %xmm3, %xmm0 +; SSE-32-NEXT: subsd %xmm2, %xmm0 ; SSE-32-NEXT: cvttsd2si %xmm0, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm0 @@ -978,28 +1011,32 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 { ; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32: ; SSE-32: # %bb.0: -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm3, %xmm0 +; SSE-32-NEXT: xorps %xmm2, %xmm2 +; SSE-32-NEXT: xorps %xmm1, %xmm1 +; SSE-32-NEXT: jb .LBB7_2 +; SSE-32-NEXT: # %bb.1: +; SSE-32-NEXT: movaps %xmm3, %xmm1 +; SSE-32-NEXT: .LBB7_2: ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movaps %xmm0, %xmm1 -; SSE-32-NEXT: cmpltss %xmm2, %xmm1 -; SSE-32-NEXT: andnps %xmm2, %xmm1 -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: subss %xmm1, %xmm3 -; SSE-32-NEXT: cvttss2si %xmm3, %ecx +; SSE-32-NEXT: movaps %xmm0, %xmm4 +; SSE-32-NEXT: subss %xmm1, %xmm4 +; SSE-32-NEXT: cvttss2si %xmm4, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm1 ; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: comiss %xmm3, %xmm0 +; SSE-32-NEXT: jb .LBB7_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movaps %xmm3, %xmm2 +; SSE-32-NEXT: .LBB7_4: ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: cmpltss %xmm2, %xmm3 -; SSE-32-NEXT: andnps %xmm2, %xmm3 -; SSE-32-NEXT: subss %xmm3, %xmm0 +; SSE-32-NEXT: subss %xmm2, %xmm0 ; SSE-32-NEXT: cvttss2si %xmm0, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm0 @@ -1542,7 +1579,7 @@ ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp ; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: comisd %xmm2, %xmm0 ; SSE-32-NEXT: xorpd %xmm1, %xmm1 ; SSE-32-NEXT: xorpd %xmm3, %xmm3 ; SSE-32-NEXT: jb .LBB17_2 @@ -1562,7 +1599,7 @@ ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: comisd %xmm2, %xmm0 ; SSE-32-NEXT: jb .LBB17_4 ; SSE-32-NEXT: # %bb.3: ; SSE-32-NEXT: movapd %xmm2, %xmm1 @@ -1598,29 +1635,33 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movapd %xmm0, %xmm1 -; SSE-64-NEXT: cmpltsd %xmm2, %xmm1 -; SSE-64-NEXT: andnpd %xmm2, %xmm1 -; SSE-64-NEXT: movapd %xmm0, %xmm3 -; SSE-64-NEXT: subsd %xmm1, %xmm3 -; SSE-64-NEXT: cvttsd2si %xmm3, %rcx +; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: xorpd %xmm2, %xmm2 +; SSE-64-NEXT: xorpd %xmm1, %xmm1 +; SSE-64-NEXT: jb .LBB17_2 +; SSE-64-NEXT: # %bb.1: +; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: .LBB17_2: +; SSE-64-NEXT: movapd %xmm0, %xmm4 +; SSE-64-NEXT: subsd %xmm1, %xmm4 +; SSE-64-NEXT: cvttsd2si %xmm4, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movapd %xmm0, %xmm3 -; SSE-64-NEXT: cmpltsd %xmm2, %xmm3 -; SSE-64-NEXT: andnpd %xmm2, %xmm3 -; SSE-64-NEXT: subsd %xmm3, %xmm0 -; SSE-64-NEXT: cvttsd2si %xmm0, %rcx +; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: jb .LBB17_4 +; SSE-64-NEXT: # %bb.3: +; SSE-64-NEXT: movapd %xmm3, %xmm2 +; SSE-64-NEXT: .LBB17_4: +; SSE-64-NEXT: subsd %xmm2, %xmm0 +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm0 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1638,7 +1679,7 @@ ; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB17_2 @@ -1653,7 +1694,7 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX-32-NEXT: jb .LBB17_4 ; AVX-32-NEXT: # %bb.3: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 @@ -1678,28 +1719,34 @@ ; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 -; AVX-64-NEXT: vandnpd %xmm1, %xmm2, %xmm2 -; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm2 -; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX-64-NEXT: jb .LBB17_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovapd %xmm1, %xmm3 +; AVX-64-NEXT: .LBB17_2: +; AVX-64-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttsd2si %xmm3, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovq %rcx, %xmm3 ; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm1 -; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB17_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovapd %xmm1, %xmm2 +; AVX-64-NEXT: .LBB17_4: +; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX-64-NEXT: retq ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i1: @@ -1863,48 +1910,50 @@ ; SSE-32-NEXT: .cfi_def_cfa_register %ebp ; SSE-32-NEXT: andl $-8, %esp ; SSE-32-NEXT: subl $24, %esp -; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-32-NEXT: movaps %xmm0, %xmm2 -; SSE-32-NEXT: cmpltss %xmm1, %xmm2 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: subss %xmm2, %xmm3 -; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) -; SSE-32-NEXT: movaps %xmm0, %xmm2 -; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: xorps %xmm1, %xmm1 +; SSE-32-NEXT: xorps %xmm3, %xmm3 +; SSE-32-NEXT: jb .LBB19_2 +; SSE-32-NEXT: # %bb.1: ; SSE-32-NEXT: movaps %xmm2, %xmm3 -; SSE-32-NEXT: cmpltss %xmm1, %xmm3 -; SSE-32-NEXT: andnps %xmm1, %xmm3 -; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: .LBB19_2: +; SSE-32-NEXT: movaps %xmm0, %xmm4 ; SSE-32-NEXT: subss %xmm3, %xmm4 ; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %al ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) -; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: comiss %xmm2, %xmm0 +; SSE-32-NEXT: jb .LBB19_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movaps %xmm2, %xmm1 +; SSE-32-NEXT: .LBB19_4: +; SSE-32-NEXT: subss %xmm1, %xmm0 +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %cl ; SSE-32-NEXT: flds {{[0-9]+}}(%esp) ; SSE-32-NEXT: fnstcw (%esp) -; SSE-32-NEXT: movzwl (%esp), %eax -; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl (%esp), %edx +; SSE-32-NEXT: orl $3072, %edx # imm = 0xC00 +; SSE-32-NEXT: movw %dx, {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) ; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) ; SSE-32-NEXT: fldcw (%esp) -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm1, %xmm0 -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax ; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm1, %xmm2 -; SSE-32-NEXT: setae %al +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-32-NEXT: movzbl %cl, %eax ; SSE-32-NEXT: shll $31, %eax ; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movd %eax, %xmm1 @@ -1918,29 +1967,33 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomiss %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movaps %xmm0, %xmm1 -; SSE-64-NEXT: cmpltss %xmm2, %xmm1 -; SSE-64-NEXT: andnps %xmm2, %xmm1 -; SSE-64-NEXT: movaps %xmm0, %xmm3 -; SSE-64-NEXT: subss %xmm1, %xmm3 -; SSE-64-NEXT: cvttss2si %xmm3, %rcx +; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: xorps %xmm2, %xmm2 +; SSE-64-NEXT: xorps %xmm1, %xmm1 +; SSE-64-NEXT: jb .LBB19_2 +; SSE-64-NEXT: # %bb.1: +; SSE-64-NEXT: movaps %xmm3, %xmm1 +; SSE-64-NEXT: .LBB19_2: +; SSE-64-NEXT: movaps %xmm0, %xmm4 +; SSE-64-NEXT: subss %xmm1, %xmm4 +; SSE-64-NEXT: cvttss2si %xmm4, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomiss %xmm2, %xmm0 -; SSE-64-NEXT: setae %al -; SSE-64-NEXT: shlq $63, %rax -; SSE-64-NEXT: movaps %xmm0, %xmm3 -; SSE-64-NEXT: cmpltss %xmm2, %xmm3 -; SSE-64-NEXT: andnps %xmm2, %xmm3 -; SSE-64-NEXT: subss %xmm3, %xmm0 -; SSE-64-NEXT: cvttss2si %xmm0, %rcx +; SSE-64-NEXT: comiss %xmm3, %xmm0 +; SSE-64-NEXT: jb .LBB19_4 +; SSE-64-NEXT: # %bb.3: +; SSE-64-NEXT: movaps %xmm3, %xmm2 +; SSE-64-NEXT: .LBB19_4: +; SSE-64-NEXT: subss %xmm2, %xmm0 +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: setae %cl +; SSE-64-NEXT: movzbl %cl, %ecx +; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx ; SSE-64-NEXT: movq %rcx, %xmm0 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1956,28 +2009,34 @@ ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-32-NEXT: vcmpltss %xmm2, %xmm1, %xmm3 -; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 -; AVX-32-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vcomiss %xmm1, %xmm3 +; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB19_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: .LBB19_2: +; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vcmpltss %xmm2, %xmm0, %xmm3 -; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 -; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 -; AVX-32-NEXT: vmovss %xmm3, (%esp) ; AVX-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) -; AVX-32-NEXT: flds (%esp) -; AVX-32-NEXT: fisttpll (%esp) -; AVX-32-NEXT: xorl %eax, %eax -; AVX-32-NEXT: vucomiss %xmm2, %xmm1 ; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: xorl %ecx, %ecx -; AVX-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB19_4 +; AVX-32-NEXT: # %bb.3: +; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: .LBB19_4: +; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) ; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1992,28 +2051,34 @@ ; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 -; AVX-64-NEXT: vandnps %xmm1, %xmm2, %xmm2 -; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm2 -; AVX-64-NEXT: vcvttss2si %xmm2, %rcx +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-64-NEXT: jb .LBB19_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovaps %xmm1, %xmm3 +; AVX-64-NEXT: .LBB19_2: +; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttss2si %xmm3, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovq %rcx, %xmm3 ; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm1 -; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB19_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovaps %xmm1, %xmm2 +; AVX-64-NEXT: .LBB19_4: +; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX-64-NEXT: retq ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i1: @@ -2085,58 +2150,66 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movaps %xmm0, %xmm1 ; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm1 +; SSE-32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-32-NEXT: comiss %xmm3, %xmm1 +; SSE-32-NEXT: xorps %xmm2, %xmm2 +; SSE-32-NEXT: xorps %xmm4, %xmm4 +; SSE-32-NEXT: jb .LBB21_2 +; SSE-32-NEXT: # %bb.1: +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: .LBB21_2: ; SSE-32-NEXT: setae %al -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movaps %xmm1, %xmm3 -; SSE-32-NEXT: cmpltss %xmm2, %xmm3 -; SSE-32-NEXT: andnps %xmm2, %xmm3 -; SSE-32-NEXT: subss %xmm3, %xmm1 -; SSE-32-NEXT: cvttss2si %xmm1, %ecx -; SSE-32-NEXT: xorl %eax, %ecx -; SSE-32-NEXT: movd %ecx, %xmm1 -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm3 +; SSE-32-NEXT: movzbl %al, %ecx +; SSE-32-NEXT: shll $31, %ecx +; SSE-32-NEXT: subss %xmm4, %xmm1 +; SSE-32-NEXT: cvttss2si %xmm1, %eax +; SSE-32-NEXT: xorl %ecx, %eax +; SSE-32-NEXT: movaps %xmm0, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-32-NEXT: comiss %xmm3, %xmm4 +; SSE-32-NEXT: xorps %xmm5, %xmm5 +; SSE-32-NEXT: jb .LBB21_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: .LBB21_4: +; SSE-32-NEXT: movd %eax, %xmm1 ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movaps %xmm3, %xmm4 -; SSE-32-NEXT: cmpltss %xmm2, %xmm4 -; SSE-32-NEXT: andnps %xmm2, %xmm4 -; SSE-32-NEXT: subss %xmm4, %xmm3 -; SSE-32-NEXT: cvttss2si %xmm3, %ecx +; SSE-32-NEXT: subss %xmm5, %xmm4 +; SSE-32-NEXT: cvttss2si %xmm4, %ecx ; SSE-32-NEXT: xorl %eax, %ecx -; SSE-32-NEXT: movd %ecx, %xmm3 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: movd %ecx, %xmm4 +; SSE-32-NEXT: comiss %xmm3, %xmm0 +; SSE-32-NEXT: xorps %xmm5, %xmm5 +; SSE-32-NEXT: jb .LBB21_6 +; SSE-32-NEXT: # %bb.5: +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: .LBB21_6: +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax ; SSE-32-NEXT: movaps %xmm0, %xmm1 -; SSE-32-NEXT: cmpltss %xmm2, %xmm1 -; SSE-32-NEXT: andnps %xmm2, %xmm1 -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: subss %xmm1, %xmm4 -; SSE-32-NEXT: cvttss2si %xmm4, %ecx +; SSE-32-NEXT: subss %xmm5, %xmm1 +; SSE-32-NEXT: cvttss2si %xmm1, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm1 ; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: comiss %xmm3, %xmm0 +; SSE-32-NEXT: jb .LBB21_8 +; SSE-32-NEXT: # %bb.7: +; SSE-32-NEXT: movaps %xmm3, %xmm2 +; SSE-32-NEXT: .LBB21_8: ; SSE-32-NEXT: setae %al +; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movaps %xmm0, %xmm4 -; SSE-32-NEXT: cmpltss %xmm2, %xmm4 -; SSE-32-NEXT: andnps %xmm2, %xmm4 -; SSE-32-NEXT: subss %xmm4, %xmm0 +; SSE-32-NEXT: subss %xmm2, %xmm0 ; SSE-32-NEXT: cvttss2si %xmm0, %ecx ; SSE-32-NEXT: xorl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm0 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-32-NEXT: movdqa %xmm1, %xmm0 ; SSE-32-NEXT: retl ; Index: llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -163,7 +163,7 @@ ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB1_2 @@ -180,7 +180,7 @@ ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX-32-NEXT: vcomisd %xmm1, %xmm4 ; AVX-32-NEXT: vxorpd %xmm5, %xmm5, %xmm5 ; AVX-32-NEXT: jb .LBB1_4 ; AVX-32-NEXT: # %bb.3: @@ -194,7 +194,7 @@ ; AVX-32-NEXT: movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB1_6 ; AVX-32-NEXT: # %bb.5: @@ -208,7 +208,7 @@ ; AVX-32-NEXT: movzbl %dl, %edx ; AVX-32-NEXT: shll $31, %edx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX-32-NEXT: jb .LBB1_8 ; AVX-32-NEXT: # %bb.7: ; AVX-32-NEXT: vmovapd %xmm1, %xmm2 @@ -237,53 +237,65 @@ ; ; AVX-64-LABEL: strict_vector_fptoui_v4f64_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm2 +; AVX-64-NEXT: vcomisd %xmm1, %xmm3 +; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-64-NEXT: jb .LBB1_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovapd %xmm1, %xmm4 +; AVX-64-NEXT: .LBB1_2: +; AVX-64-NEXT: vsubsd %xmm4, %xmm3, %xmm4 +; AVX-64-NEXT: vcvttsd2si %xmm4, %rcx ; AVX-64-NEXT: setae %al +; AVX-64-NEXT: movzbl %al, %eax ; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm2, %xmm3 -; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm3 -; AVX-64-NEXT: vsubsd %xmm3, %xmm2, %xmm3 -; AVX-64-NEXT: vcvttsd2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rcx, %rax +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX-64-NEXT: vcomisd %xmm1, %xmm4 +; AVX-64-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX-64-NEXT: jb .LBB1_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovapd %xmm1, %xmm5 +; AVX-64-NEXT: .LBB1_4: +; AVX-64-NEXT: vmovq %rax, %xmm3 +; AVX-64-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX-64-NEXT: vcvttsd2si %xmm4, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm3 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm2 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm2, %xmm4 -; AVX-64-NEXT: vandnpd %xmm1, %xmm4, %xmm4 -; AVX-64-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx -; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm3 -; AVX-64-NEXT: vsubsd %xmm3, %xmm0, %xmm3 -; AVX-64-NEXT: vcvttsd2si %xmm3, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm4 +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX-64-NEXT: jb .LBB1_6 +; AVX-64-NEXT: # %bb.5: +; AVX-64-NEXT: vmovapd %xmm1, %xmm5 +; AVX-64-NEXT: .LBB1_6: +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX-64-NEXT: vsubsd %xmm5, %xmm0, %xmm4 +; AVX-64-NEXT: vcvttsd2si %xmm4, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm3 +; AVX-64-NEXT: vmovq %rcx, %xmm4 ; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm4 -; AVX-64-NEXT: vandnpd %xmm1, %xmm4, %xmm1 -; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: vcomisd %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB1_8 +; AVX-64-NEXT: # %bb.7: +; AVX-64-NEXT: vmovapd %xmm1, %xmm2 +; AVX-64-NEXT: .LBB1_8: +; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v4f64_to_v4i64: @@ -301,7 +313,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -319,7 +331,7 @@ ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %dl ; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 @@ -332,7 +344,7 @@ ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 @@ -345,7 +357,7 @@ ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} @@ -532,7 +544,7 @@ ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB3_2 @@ -548,7 +560,7 @@ ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB3_4 ; AVX-32-NEXT: # %bb.3: @@ -563,7 +575,7 @@ ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-32-NEXT: jb .LBB3_6 ; AVX-32-NEXT: # %bb.5: @@ -577,7 +589,7 @@ ; AVX-32-NEXT: movzbl %dl, %edx ; AVX-32-NEXT: shll $31, %edx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx -; AVX-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX-32-NEXT: jb .LBB3_8 ; AVX-32-NEXT: # %bb.7: ; AVX-32-NEXT: vmovaps %xmm1, %xmm2 @@ -606,53 +618,65 @@ ; ; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX-64-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm2 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm2, %xmm3 -; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm3 -; AVX-64-NEXT: vsubss %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vcvttss2si %xmm2, %rcx -; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm2 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm3 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm3, %xmm4 -; AVX-64-NEXT: vandnps %xmm1, %xmm4, %xmm4 +; AVX-64-NEXT: vcomiss %xmm1, %xmm3 +; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-64-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-64-NEXT: jb .LBB3_2 +; AVX-64-NEXT: # %bb.1: +; AVX-64-NEXT: vmovaps %xmm1, %xmm4 +; AVX-64-NEXT: .LBB3_2: ; AVX-64-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-64-NEXT: vcvttss2si %xmm3, %rcx -; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm3 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 ; AVX-64-NEXT: setae %al +; AVX-64-NEXT: movzbl %al, %eax ; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm3 -; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm3 -; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3 -; AVX-64-NEXT: vcvttss2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rcx, %rax +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-64-NEXT: vcomiss %xmm1, %xmm4 +; AVX-64-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX-64-NEXT: jb .LBB3_4 +; AVX-64-NEXT: # %bb.3: +; AVX-64-NEXT: vmovaps %xmm1, %xmm5 +; AVX-64-NEXT: .LBB3_4: +; AVX-64-NEXT: vmovq %rax, %xmm3 +; AVX-64-NEXT: vsubss %xmm5, %xmm4, %xmm4 +; AVX-64-NEXT: vcvttss2si %xmm4, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx -; AVX-64-NEXT: vmovq %rcx, %xmm3 +; AVX-64-NEXT: vmovq %rcx, %xmm4 +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX-64-NEXT: jb .LBB3_6 +; AVX-64-NEXT: # %bb.5: +; AVX-64-NEXT: vmovaps %xmm1, %xmm5 +; AVX-64-NEXT: .LBB3_6: +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX-64-NEXT: vsubss %xmm5, %xmm0, %xmm4 +; AVX-64-NEXT: vcvttss2si %xmm4, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm4 ; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-64-NEXT: xorl %eax, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: setae %al -; AVX-64-NEXT: shlq $63, %rax -; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm4 -; AVX-64-NEXT: vandnps %xmm1, %xmm4, %xmm1 -; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vcomiss %xmm1, %xmm0 +; AVX-64-NEXT: jb .LBB3_8 +; AVX-64-NEXT: # %bb.7: +; AVX-64-NEXT: vmovaps %xmm1, %xmm2 +; AVX-64-NEXT: .LBB3_8: +; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: setae %cl +; AVX-64-NEXT: movzbl %cl, %ecx +; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v4f32_to_v4i64: @@ -670,7 +694,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 ; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -687,7 +711,7 @@ ; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %dl ; AVX512VL-32-NEXT: kmovw %edx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -701,7 +725,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -714,7 +738,7 @@ ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} Index: llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -150,7 +150,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 @@ -167,7 +167,7 @@ ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 @@ -181,7 +181,7 @@ ; AVX512VL-32-NEXT: shll $31, %eax ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 @@ -197,7 +197,7 @@ ; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 @@ -212,7 +212,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 @@ -227,7 +227,7 @@ ; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 @@ -240,7 +240,7 @@ ; AVX512VL-32-NEXT: shll $31, %ecx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 @@ -252,7 +252,7 @@ ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: shll $31, %eax ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} @@ -454,7 +454,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -470,7 +470,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -485,7 +485,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -501,7 +501,7 @@ ; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 @@ -516,7 +516,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -530,7 +530,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm4 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm4 ; AVX512VL-32-NEXT: setb %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 @@ -544,7 +544,7 @@ ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 @@ -556,7 +556,7 @@ ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: shll $31, %eax ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setb %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} Index: llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4465,18 +4465,38 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm0, %xmm2 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ja .LBB115_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: .LBB115_2: # %entry +; CHECK-NEXT: subss %xmm1, %xmm0 +; CHECK-NEXT: cvttss2si %xmm0, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vcomiss %xmm0, %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: ja .LBB115_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: .LBB115_2: # %entry +; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f32: @@ -4493,30 +4513,70 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rax -; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB116_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: .LBB116_2: # %entry +; CHECK-NEXT: subss %xmm3, %xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm3, %xmm1 +; CHECK-NEXT: ja .LBB116_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: .LBB116_4: # %entry +; CHECK-NEXT: subss %xmm0, %xmm3 +; CHECK-NEXT: cvttss2si %xmm3, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm2, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB116_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm3 +; AVX1-NEXT: .LBB116_2: # %entry +; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm3, %xmm0 +; AVX1-NEXT: ja .LBB116_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: .LBB116_4: # %entry +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0 ; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f32: @@ -4537,35 +4597,95 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rax -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rdx +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rcx +; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB117_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: .LBB117_2: # %entry +; CHECK-NEXT: subss %xmm3, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB117_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: .LBB117_4: # %entry +; CHECK-NEXT: subss %xmm3, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %rcx +; CHECK-NEXT: setbe %dl +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: shlq $63, %rdx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm2, %xmm1 +; CHECK-NEXT: ja .LBB117_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: .LBB117_6: # %entry +; CHECK-NEXT: subss %xmm0, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %rsi +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rsi, %rcx ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm2, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB117_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm3 +; AVX1-NEXT: .LBB117_2: # %entry +; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: ja .LBB117_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm4 +; AVX1-NEXT: .LBB117_4: # %entry +; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm3, %xmm0 +; AVX1-NEXT: ja .LBB117_6 +; AVX1-NEXT: # %bb.5: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: .LBB117_6: # %entry +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0 ; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32: @@ -4590,49 +4710,129 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm1, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm0, %xmm2 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB118_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: .LBB118_2: # %entry +; CHECK-NEXT: subss %xmm3, %xmm0 +; CHECK-NEXT: cvttss2si %xmm0, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm1, %xmm0 +; CHECK-NEXT: comiss %xmm0, %xmm2 +; CHECK-NEXT: xorps %xmm4, %xmm4 +; CHECK-NEXT: ja .LBB118_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movaps %xmm2, %xmm4 +; CHECK-NEXT: .LBB118_4: # %entry +; CHECK-NEXT: movq %rax, %xmm3 +; CHECK-NEXT: subss %xmm4, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm1, %xmm2 -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: subss %xmm1, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm4, %xmm2 +; CHECK-NEXT: xorps %xmm5, %xmm5 +; CHECK-NEXT: ja .LBB118_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: .LBB118_6: # %entry +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: subss %xmm5, %xmm4 +; CHECK-NEXT: cvttss2si %xmm4, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm3 +; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: comiss %xmm4, %xmm2 +; CHECK-NEXT: ja .LBB118_8 +; CHECK-NEXT: # %bb.7: # %entry +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: .LBB118_8: # %entry +; CHECK-NEXT: subss %xmm1, %xmm4 +; CHECK-NEXT: cvttss2si %xmm4, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm2, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB118_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm3 +; AVX1-NEXT: .LBB118_2: # %entry +; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax ; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vcomiss %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: ja .LBB118_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm4 +; AVX1-NEXT: .LBB118_4: # %entry +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm4, %xmm0 +; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: ja .LBB118_6 +; AVX1-NEXT: # %bb.5: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm5 +; AVX1-NEXT: .LBB118_6: # %entry +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX1-NEXT: vcomiss %xmm4, %xmm0 +; AVX1-NEXT: ja .LBB118_8 +; AVX1-NEXT: # %bb.7: # %entry +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: .LBB118_8: # %entry +; AVX1-NEXT: vsubss %xmm1, %xmm4, %xmm0 ; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f32: @@ -4810,18 +5010,38 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorpd %xmm0, %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: comisd %xmm0, %xmm2 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: ja .LBB123_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: .LBB123_2: # %entry +; CHECK-NEXT: subsd %xmm1, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vcomisd %xmm0, %xmm1 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: ja .LBB123_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovapd %xmm1, %xmm2 +; AVX1-NEXT: .LBB123_2: # %entry +; AVX1-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f64: @@ -4838,30 +5058,70 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorpd %xmm0, %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax -; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm2 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB124_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: .LBB124_2: # %entry +; CHECK-NEXT: subsd %xmm3, %xmm2 ; CHECK-NEXT: cvttsd2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm2 +; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: comisd %xmm3, %xmm1 +; CHECK-NEXT: ja .LBB124_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: .LBB124_4: # %entry +; CHECK-NEXT: subsd %xmm0, %xmm3 +; CHECK-NEXT: cvttsd2si %xmm3, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttsd2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB124_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm3 +; AVX1-NEXT: .LBB124_2: # %entry +; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttsd2si %xmm2, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm3, %xmm0 +; AVX1-NEXT: ja .LBB124_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm1 +; AVX1-NEXT: .LBB124_4: # %entry +; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0 ; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f64: @@ -4890,35 +5150,95 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorpd %xmm0, %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rdx +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: subsd %xmm0, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rcx +; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB125_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: .LBB125_2: # %entry +; CHECK-NEXT: subsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB125_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movapd %xmm1, %xmm3 +; CHECK-NEXT: .LBB125_4: # %entry +; CHECK-NEXT: subsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rcx +; CHECK-NEXT: setbe %dl +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: shlq $63, %rdx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: comisd %xmm2, %xmm1 +; CHECK-NEXT: ja .LBB125_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: .LBB125_6: # %entry +; CHECK-NEXT: subsd %xmm0, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rsi +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rsi, %rcx ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttsd2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB125_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm3 +; AVX1-NEXT: .LBB125_2: # %entry +; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttsd2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm3, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: ja .LBB125_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm4 +; AVX1-NEXT: .LBB125_4: # %entry +; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm3, %xmm0 +; AVX1-NEXT: ja .LBB125_6 +; AVX1-NEXT: # %bb.5: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm1 +; AVX1-NEXT: .LBB125_6: # %entry +; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0 ; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64: @@ -4943,49 +5263,129 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: subsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: subsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: subsd %xmm1, %xmm2 -; CHECK-NEXT: cvttsd2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: subsd %xmm1, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: comisd %xmm0, %xmm2 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: ja .LBB126_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movapd %xmm2, %xmm3 +; CHECK-NEXT: .LBB126_2: # %entry +; CHECK-NEXT: subsd %xmm3, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %rcx +; CHECK-NEXT: setbe %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shlq $63, %rax +; CHECK-NEXT: xorq %rcx, %rax +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: comisd %xmm0, %xmm2 +; CHECK-NEXT: xorpd %xmm4, %xmm4 +; CHECK-NEXT: ja .LBB126_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: movapd %xmm2, %xmm4 +; CHECK-NEXT: .LBB126_4: # %entry +; CHECK-NEXT: movq %rax, %xmm3 +; CHECK-NEXT: subsd %xmm4, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: comisd %xmm4, %xmm2 +; CHECK-NEXT: xorpd %xmm5, %xmm5 +; CHECK-NEXT: ja .LBB126_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: movapd %xmm2, %xmm5 +; CHECK-NEXT: .LBB126_6: # %entry +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: subsd %xmm5, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm3 +; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: comisd %xmm4, %xmm2 +; CHECK-NEXT: ja .LBB126_8 +; CHECK-NEXT: # %bb.7: # %entry +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: .LBB126_8: # %entry +; CHECK-NEXT: subsd %xmm1, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %rax +; CHECK-NEXT: setbe %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: shlq $63, %rcx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vcvttsd2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vcvttsd2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vcvttsd2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: ja .LBB126_2 +; AVX1-NEXT: # %bb.1: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm3 +; AVX1-NEXT: .LBB126_2: # %entry +; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax ; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vcomisd %xmm3, %xmm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: ja .LBB126_4 +; AVX1-NEXT: # %bb.3: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm4 +; AVX1-NEXT: .LBB126_4: # %entry +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm4, %xmm0 +; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: ja .LBB126_6 +; AVX1-NEXT: # %bb.5: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm5 +; AVX1-NEXT: .LBB126_6: # %entry +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubsd %xmm5, %xmm4, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vcomisd %xmm4, %xmm0 +; AVX1-NEXT: ja .LBB126_8 +; AVX1-NEXT: # %bb.7: # %entry +; AVX1-NEXT: vmovapd %xmm0, %xmm1 +; AVX1-NEXT: .LBB126_8: # %entry +; AVX1-NEXT: vsubsd %xmm1, %xmm4, %xmm0 ; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: setbe %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f64: @@ -6384,34 +6784,34 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: js .LBB170_1 -; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB170_1: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovnsq %rdi, %rcx +; CHECK-NEXT: cvtsi2ss %rcx, %xmm0 +; CHECK-NEXT: jns .LBB170_2 +; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: .LBB170_2: # %entry ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: testq %rdi, %rdi -; AVX1-NEXT: js .LBB170_1 -; AVX1-NEXT: # %bb.2: # %entry -; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB170_1: ; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: andl $1, %edi -; AVX1-NEXT: orq %rax, %rdi -; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: cmovnsq %rdi, %rcx +; AVX1-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; AVX1-NEXT: jns .LBB170_2 +; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB170_2: # %entry ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64: @@ -6548,74 +6948,65 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB174_1 -; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: jns .LBB174_5 -; CHECK-NEXT: .LBB174_4: ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: addss %xmm1, %xmm1 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB174_1: -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm0 +; CHECK-NEXT: jns .LBB174_2 +; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: .LBB174_2: # %entry ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB174_4 -; CHECK-NEXT: .LBB174_5: # %entry +; CHECK-NEXT: cmovnsq %rax, %rdx ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: jns .LBB174_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: .LBB174_4: # %entry ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB174_1 -; AVX1-NEXT: # %bb.2: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: jns .LBB174_5 -; AVX1-NEXT: .LBB174_4: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB174_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 +; AVX1-NEXT: jns .LBB174_2 +; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB174_2: # %entry ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB174_4 -; AVX1-NEXT: .LBB174_5: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm0 +; AVX1-NEXT: jns .LBB174_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB174_4: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX1-NEXT: retq ; @@ -6805,100 +7196,90 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: js .LBB178_1 -; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: jns .LBB178_5 -; CHECK-NEXT: .LBB178_4: +; CHECK-NEXT: cmovnsq %rsi, %rcx +; CHECK-NEXT: cvtsi2ss %rcx, %xmm1 +; CHECK-NEXT: jns .LBB178_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: .LBB178_2: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovnsq %rdi, %rcx +; CHECK-NEXT: cvtsi2ss %rcx, %xmm0 +; CHECK-NEXT: jns .LBB178_4 +; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: .LBB178_4: # %entry ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: jns .LBB178_8 -; CHECK-NEXT: .LBB178_7: ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andl $1, %edx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 -; CHECK-NEXT: addss %xmm1, %xmm1 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB178_1: -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andl $1, %esi -; CHECK-NEXT: orq %rax, %rsi -; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 -; CHECK-NEXT: addss %xmm1, %xmm1 -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: js .LBB178_4 -; CHECK-NEXT: .LBB178_5: # %entry -; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: js .LBB178_7 -; CHECK-NEXT: .LBB178_8: # %entry +; CHECK-NEXT: cmovnsq %rdx, %rcx ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: cvtsi2ss %rcx, %xmm1 +; CHECK-NEXT: jns .LBB178_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: .LBB178_6: # %entry ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB178_1 -; AVX1-NEXT: # %bb.2: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: jns .LBB178_5 -; AVX1-NEXT: .LBB178_4: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB178_6 -; AVX1-NEXT: .LBB178_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 +; AVX1-NEXT: jns .LBB178_2 +; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB178_2: # %entry ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB178_4 -; AVX1-NEXT: .LBB178_5: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: .LBB178_6: # %entry +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 +; AVX1-NEXT: jns .LBB178_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB178_4: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB178_7 -; AVX1-NEXT: # %bb.8: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB178_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 +; AVX1-NEXT: jns .LBB178_6 +; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB178_6: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6949,10 +7330,10 @@ ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -6978,12 +7359,12 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535] -; CHECK-NEXT: andps %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 ; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: mulps {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: retq ; @@ -6991,10 +7372,10 @@ ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2ps %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2ps %xmm1, %xmm1 ; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -7078,73 +7459,62 @@ ; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB182_1 -; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: cvtsi2ss %rdx, %xmm2 +; CHECK-NEXT: jns .LBB182_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addss %xmm2, %xmm2 +; CHECK-NEXT: .LBB182_2: # %entry ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: jns .LBB182_5 -; CHECK-NEXT: .LBB182_4: ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: cvtsi2ss %rdx, %xmm3 +; CHECK-NEXT: jns .LBB182_4 +; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: addss %xmm3, %xmm3 +; CHECK-NEXT: .LBB182_4: # %entry ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: jns .LBB182_8 -; CHECK-NEXT: .LBB182_7: ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: addss %xmm1, %xmm1 -; CHECK-NEXT: jmp .LBB182_9 -; CHECK-NEXT: .LBB182_1: -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: cvtsi2ss %rax, %xmm2 -; CHECK-NEXT: addss %xmm2, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB182_4 -; CHECK-NEXT: .LBB182_5: # %entry -; CHECK-NEXT: cvtsi2ss %rax, %xmm3 -; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB182_7 -; CHECK-NEXT: .LBB182_8: # %entry +; CHECK-NEXT: cmovnsq %rax, %rdx ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: .LBB182_9: # %entry +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: jns .LBB182_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: .LBB182_6: # %entry ; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: js .LBB182_10 -; CHECK-NEXT: # %bb.11: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB182_10: ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm0 +; CHECK-NEXT: jns .LBB182_8 +; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: .LBB182_8: # %entry ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -7153,68 +7523,60 @@ ; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB182_1 -; AVX1-NEXT: # %bb.2: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: jns .LBB182_5 -; AVX1-NEXT: .LBB182_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB182_6 -; AVX1-NEXT: .LBB182_1: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 +; AVX1-NEXT: jns .LBB182_2 +; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB182_2: # %entry ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB182_4 -; AVX1-NEXT: .LBB182_5: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: .LBB182_6: # %entry +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 +; AVX1-NEXT: jns .LBB182_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB182_4: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB182_7 -; AVX1-NEXT: # %bb.8: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: jns .LBB182_11 -; AVX1-NEXT: .LBB182_10: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB182_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm2 +; AVX1-NEXT: jns .LBB182_6 +; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB182_6: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB182_10 -; AVX1-NEXT: .LBB182_11: # %entry -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 +; AVX1-NEXT: jns .LBB182_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB182_8: # %entry ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -7238,39 +7600,28 @@ ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm2, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpsrlq $1, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rax +; AVX512DQ-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax +; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm0, %rax ; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovq %xmm2, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vmovq %xmm2, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rax -; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; AVX512DQ-NEXT: vaddps %xmm2, %xmm2, %xmm2 -; AVX512DQ-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovq %xmm0, %rax +; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rax +; AVX512DQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX512DQ-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq entry: