diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -133,7 +133,7 @@ Changes to the X86 Backend -------------------------- -* ... +* Support ``half`` type on SSE2 and above targets. Changes to the OCaml bindings ----------------------------- diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -147,9 +147,8 @@ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { - return (VT == MVT::f64 && Subtarget->hasSSE2()) || - (VT == MVT::f32 && Subtarget->hasSSE1()) || - (VT == MVT::f16 && Subtarget->hasFP16()); + return ((VT == MVT::f16 || VT == MVT::f64) && Subtarget->hasSSE2()) || + (VT == MVT::f32 && Subtarget->hasSSE1()); } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -2281,12 +2280,13 @@ default: return false; case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; - case MVT::f16: Opc = X86::CMOV_FR16X; break; case MVT::i32: Opc = X86::CMOV_GR32; break; - case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X - : X86::CMOV_FR32; break; - case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X - : X86::CMOV_FR64; break; + case MVT::f16: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break; + case MVT::f32: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; + case MVT::f64: + Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); @@ -3903,6 +3903,9 @@ unsigned Opc = 0; switch (VT.SimpleTy) { default: return 0; + case MVT::f16: + Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH; + break; case MVT::f32: Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : HasSSE1 ? X86::FsFLD0SS diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -553,8 +553,10 @@ setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { - // f32 and f64 use SSE. + // f16, f32 and f64 use SSE. // Set up the FP register classes. + addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass + : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass @@ -586,6 +588,33 @@ setOperationAction(ISD::FSINCOS, VT, Expand); } + // Half type will be promoted by default. + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); @@ -660,6 +689,10 @@ } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Support fp16 0 immediate. + if (isTypeLegal(MVT::f16)) + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); @@ -669,7 +702,7 @@ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -678,6 +711,7 @@ // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f16, Promote); // f80 always uses X87. if (UseX87) { @@ -721,7 +755,12 @@ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + if (isTypeLegal(MVT::f16)) { + setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); + } else { + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + } // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); @@ -1443,6 +1482,13 @@ } } + if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + } + // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. @@ -3912,7 +3958,7 @@ else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f16) - RC = &X86::FR16XRegClass; + RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -5668,7 +5714,7 @@ bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || - (VT == MVT::f16 && Subtarget.hasFP16()); + (VT == MVT::f16 && Subtarget.hasBWI()); } bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { @@ -5680,8 +5726,7 @@ bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && Subtarget.hasSSE2()) || - (VT == MVT::f32 && Subtarget.hasSSE1()) || - (VT == MVT::f16 && Subtarget.hasFP16()); + (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; } bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, @@ -20764,6 +20809,9 @@ assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); + if (VT == MVT::f16 && !Subtarget.hasFP16()) + return SDValue(); + bool UseSSEReg = isScalarFPTypeInSSEReg(VT); // These are really Legal; return the operand so the caller accepts it as @@ -21229,7 +21277,7 @@ MVT DstVT = Op->getSimpleValueType(0); SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); - if (DstVT == MVT::f128) + if (DstVT == MVT::f128 || (DstVT == MVT::f16 && !Subtarget.hasFP16())) return SDValue(); if (DstVT.isVector()) @@ -22050,6 +22098,9 @@ MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); + if (SrcVT == MVT::f16 && !Subtarget.hasFP16()) + return SDValue(); + SDValue Res; if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { @@ -22388,6 +22439,9 @@ SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT == MVT::f128 || SrcVT == MVT::f16) + return SDValue(); + // If the source is in an SSE register, the node is Legal. if (isScalarFPTypeInSSEReg(SrcVT)) return Op; @@ -22459,7 +22513,8 @@ // This code is only for floats and doubles. Fall back to generic code for // anything else. - if (!isScalarFPTypeInSSEReg(SrcVT)) + if (!isScalarFPTypeInSSEReg(SrcVT) || + (SrcVT == MVT::f16 && !Subtarget.hasFP16())) return SDValue(); EVT SatVT = cast(Node->getOperand(1))->getVT(); @@ -22594,15 +22649,15 @@ SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); if (VT == MVT::f128) return SDValue(); - if (VT == MVT::f80) { - if (SVT == MVT::f16) { - assert(Subtarget.hasFP16() && "Unexpected features!"); + if (SVT == MVT::f16) { + if (VT == MVT::f80) { // FIXME: Just return SDValue() ? RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); MakeLibCallOptions CallOptions; std::pair Tmp = @@ -22613,9 +22668,42 @@ else return Tmp.first; } - return Op; + if (Subtarget.hasFP16()) + return Op; + if (VT != MVT::f32) { + if (IsStrict) + return DAG.getNode( + ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, + {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL, + {MVT::f32, MVT::Other}, {Chain, In})}); + + return DAG.getNode(ISD::FP_EXTEND, DL, VT, + DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In)); + } + + In = DAG.getBitcast(MVT::i16, In); + In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, + getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In, + DAG.getIntPtrConstant(0, DL)); + SDValue Res; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other}, + {Chain, In}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In, + DAG.getTargetConstant(4, DL, MVT::i32)); + } + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res, + DAG.getIntPtrConstant(0, DL)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; } + if (!SVT.isVector()) + return Op; + if (SVT.getVectorElementType() == MVT::f16) { assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); if (SVT == MVT::v2f16) @@ -22641,15 +22729,61 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); + + SDLoc DL(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1); MVT VT = Op.getSimpleValueType(); MVT SVT = In.getSimpleValueType(); - // It's legal except when f128 is involved or we're converting f80->f16. - if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80)) - return Op; + if (SVT == MVT::f128) + return SDValue(); - return SDValue(); + if (VT == MVT::f16) { + if (Subtarget.hasFP16()) + return SVT == MVT::f80 ? SDValue() : Op; + + if (SVT != MVT::f32) { + if (IsStrict) + return DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, + DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, + {Chain, In, Op2}), + Op2}); + + return DAG.getNode(ISD::FP_ROUND, DL, VT, + DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), + Op2); + } + + SDValue Res; + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, + DAG.getConstantFP(0, DL, MVT::v4f32), In, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, + {Chain, Res, DAG.getTargetConstant(4, DL, MVT::i32)}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In); + Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, + DAG.getTargetConstant(4, DL, MVT::i32)); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getBitcast(MVT::f16, Res); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + + return Res; + } + + return Op; } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -24674,6 +24808,9 @@ MVT VT = Op1.getSimpleValueType(); SDValue CC; + if (VT == MVT::f16 && !Subtarget.hasFP16()) + return SDValue(); + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. @@ -35805,6 +35942,8 @@ case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_FR16: + case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: @@ -44067,6 +44206,7 @@ // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 && + !(VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -476,6 +476,7 @@ def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; } @@ -508,25 +509,23 @@ def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; } -let Predicates = [HasFP16] in { -def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; -def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; -def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; -} - // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", @@ -535,12 +534,6 @@ [(set VR128X:$dst, fp128imm0)]>; } -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in { - def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", - [(set FR16X:$dst, fp16imm0)]>; -} - //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -678,21 +671,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; // Codegen pattern with the alternative types insert VEC128 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; // Codegen pattern with the alternative types insert VEC256 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; multiclass vinsert_for_mask_cast; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, @@ -987,14 +980,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; // Codegen pattern with the alternative types extract VEC256 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; // A 128-bit extract from bits [255:128] of a 512-bit vector should use a @@ -1020,6 +1013,10 @@ (v8i16 (VEXTRACTI128rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF128rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI128rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), @@ -1049,18 +1046,16 @@ (v8i16 (VEXTRACTI32x4Z256rr (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), (iPTR 1)))>; +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF32x4Z256rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), (v16i8 (VEXTRACTI32x4Z256rr (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), (iPTR 1)))>; } -let Predicates = [HasFP16, HasVLX] in -def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), - (v8f16 (VEXTRACTF32x4Z256rr - (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), - (iPTR 1)))>; - // Additional patterns for handling a bitcast between the vselect and the // extract_subvector. @@ -1478,7 +1473,7 @@ Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI], AddedComplexity = -10 in { def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZrm addr:$src)>; @@ -1487,7 +1482,7 @@ def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))), (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; } -let Predicates = [HasVLX, HasFP16] in { +let Predicates = [HasVLX, HasBWI], AddedComplexity = -10 in { def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), (VPBROADCASTWZ128rm addr:$src)>; def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), @@ -3763,6 +3758,9 @@ defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>; defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>; + + defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>; + defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>; } let Predicates = [HasAVX512] in { @@ -3852,7 +3850,7 @@ def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } -let Predicates = [HasFP16] in { +let Predicates = [HasBWI] in { def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), @@ -3887,7 +3885,7 @@ def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } -let Predicates = [HasFP16, HasVLX] in { +let Predicates = [HasBWI, HasVLX] in { def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), @@ -4099,14 +4097,14 @@ //===----------------------------------------------------------------------===// multiclass avx512_move_scalar prd = [HasAVX512, OptForSize]> { - let Predicates = prd in + X86VectorVTInfo _, Predicate prd = HasAVX512> { + let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + let Predicates = [prd] in { def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", @@ -4159,6 +4157,7 @@ !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>, NotMemoryFoldable; + } } defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>, @@ -4168,7 +4167,7 @@ VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, - [HasFP16]>, + HasFP16>, VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering; } -defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; -defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4353,6 +4347,12 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +let Predicates = [HasFP16] in { +defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; +defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, (v32i1 (insert_subvector (v32i1 immAllZerosV), @@ -4360,6 +4360,30 @@ (iPTR 0))), (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), GR8, sub_8bit>; + +defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk + (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), + VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; +} + defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4385,10 +4409,6 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; -defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4396,13 +4416,6 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; -defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, - (v32i1 (insert_subvector - (v32i1 immAllZerosV), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - (iPTR 0))), - (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), - GR8, sub_8bit>; defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4428,16 +4441,6 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk - (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), - VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - -def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), - (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), - (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; - def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), @@ -11651,6 +11654,13 @@ defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +let Predicates = [HasBWI], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>; + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>; +} + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations //===----------------------------------------------------------------------===// @@ -12988,7 +12998,6 @@ sub_16bit))>; def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>; -} // Allow "vmovw" to use GR64 let hasSideEffects = 0 in { @@ -12997,6 +13006,7 @@ def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; } +} // Convert 16-bit float to i16/u16 multiclass avx512_cvtph2w opc, string OpcodeStr, SDPatternOperator OpNode, diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -562,12 +562,14 @@ let Predicates = [HasMMX] in defm _VR64 : CMOVrr_PSEUDO; - defm _FR16X : CMOVrr_PSEUDO; let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; - let Predicates = [HasSSE2,NoAVX512] in + let Predicates = [HasSSE2,NoAVX512] in { + defm _FR16 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; + } let Predicates = [HasAVX512] in { + defm _FR16X : CMOVrr_PSEUDO; defm _FR32X : CMOVrr_PSEUDO; defm _FR64X : CMOVrr_PSEUDO; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -762,6 +762,7 @@ case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: + case X86::FsFLD0SH: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: @@ -3482,10 +3483,6 @@ case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; - if (X86::FR16XRegClass.hasSubClassEq(RC)) { - assert(STI.hasFP16()); - return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; - } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -3513,6 +3510,10 @@ X86::VK8PAIRRegClass.hasSubClassEq(RC) || X86::VK16PAIRRegClass.hasSubClassEq(RC)) return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE; + if ((X86::FR16RegClass.hasSubClassEq(RC) || + X86::FR16XRegClass.hasSubClassEq(RC)) && + STI.hasFP16()) + return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) @@ -3752,12 +3753,12 @@ const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = @@ -3766,6 +3767,14 @@ MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr + : Subtarget.hasAVX() ? X86::VMOVSSmr + : X86::MOVSSmr; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3794,6 +3803,14 @@ MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if ((RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) && + !Subtarget.hasFP16()) { + unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm + : Subtarget.hasAVX() ? X86::VMOVSSrm + : X86::MOVSSrm; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4752,6 +4769,7 @@ case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0SH: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { @@ -6487,6 +6505,7 @@ case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: Alignment = Align(2); break; @@ -6525,6 +6544,7 @@ case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: @@ -6564,7 +6584,7 @@ Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); - else if (Opc == X86::AVX512_FsFLD0SH) + else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH) Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -112,6 +112,8 @@ // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "", + [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>; def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", @@ -3965,6 +3967,19 @@ } // ExeDomain = SSEPackedInt +let Predicates = [UseSSE2], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + +let Predicates = [HasAVX, NoBWI], AddedComplexity = -10 in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Mask Creation //===---------------------------------------------------------------------===// @@ -5193,6 +5208,12 @@ defm PEXTRW : SS41I_extract16<0x15, "pextrw">; +let Predicates = [UseSSE41] in + def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + +let Predicates = [HasAVX, NoBWI] in + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32 opc, string OpcodeStr> { diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -17,6 +17,8 @@ let Predicates = [NoAVX512] in { // A vector extract of the first f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>; def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), @@ -34,8 +36,8 @@ } let Predicates = [NoVLX] in { - def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), - (COPY_TO_REGCLASS FR16X:$src, VR128)>; + def : Pat<(v8f16 (scalar_to_vector FR16:$src)), + (COPY_TO_REGCLASS FR16:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp --- a/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -179,6 +179,8 @@ return &X86::GR64RegClass; } if (RB.getID() == X86::VECRRegBankID) { + if (Ty.getSizeInBits() == 16) + return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; if (Ty.getSizeInBits() == 32) return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -537,6 +537,8 @@ def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;} + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill @@ -599,7 +601,7 @@ def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; -def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;} // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], diff --git a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll --- a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll +++ b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll @@ -847,228 +847,228 @@ define void @fp16() { ; SSE2-LABEL: 'fp16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 229 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 234 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 232 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'fp16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'fp16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'fp16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'fp16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) @@ -1112,56 +1112,56 @@ ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'fp16' -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) diff --git a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir --- a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir +++ b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir @@ -28,8 +28,8 @@ liveins: $rdi, $rsi ; CHECK-LABEL: name: test - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, - INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, + INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RET64 killed $rax ... @@ -45,8 +45,8 @@ ; Verify that the register ties are preserved. ; CHECK-LABEL: name: test2 - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags - INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RET64 killed $rax ... diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -4,9 +4,9 @@ ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOSSE -; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64,X64-SSE -; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64-SSE +; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX ; Note: This test is testing that the lowering for atomics matches what we ; currently emit for non-atomics + the atomic restriction. The presence of @@ -16,17 +16,45 @@ ; and their calling convention which remain unresolved.) define void @store_half(half* %fptr, half %v) { -; X86-LABEL: store_half: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movw %ax, (%ecx) -; X86-NEXT: retl +; X86-SSE1-LABEL: store_half: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movw %ax, (%ecx) +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: store_half: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movw %cx, (%eax) +; X86-SSE2-NEXT: retl ; -; X64-LABEL: store_half: -; X64: # %bb.0: -; X64-NEXT: movw %si, (%rdi) -; X64-NEXT: retq +; X86-AVX-LABEL: store_half: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movw %cx, (%eax) +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: store_half: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movw %ax, (%ecx) +; X86-NOSSE-NEXT: retl +; +; X64-SSE-LABEL: store_half: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pextrw $0, %xmm0, %eax +; X64-SSE-NEXT: movw %ax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: store_half: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpextrw $0, %xmm0, %eax +; X64-AVX-NEXT: movw %ax, (%rdi) +; X64-AVX-NEXT: retq store atomic half %v, half* %fptr unordered, align 2 ret void } @@ -193,16 +221,43 @@ } define half @load_half(half* %fptr) { -; X86-LABEL: load_half: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: retl +; X86-SSE1-LABEL: load_half: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movzwl (%eax), %eax +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_half: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl (%eax), %eax +; X86-SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: load_half: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl (%eax), %eax +; X86-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: load_half: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movzwl (%eax), %eax +; X86-NOSSE-NEXT: retl ; -; X64-LABEL: load_half: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: retq +; X64-SSE-LABEL: load_half: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movzwl (%rdi), %eax +; X64-SSE-NEXT: pinsrw $0, %eax, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: load_half: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: movzwl (%rdi), %eax +; X64-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %v = load atomic half, half* %fptr unordered, align 2 ret half %v } @@ -511,8 +566,6 @@ } -; Check the seq_cst lowering since that's the -; interesting one from an ordering perspective on x86. define void @store_float_seq_cst(float* %fptr, float %v) { ; X86-LABEL: store_float_seq_cst: diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2251,22 +2251,20 @@ ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $1, %k2, %k2 ; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: testb $1, %cl -; KNL-NEXT: movl $0, %ecx -; KNL-NEXT: je LBB85_2 -; KNL-NEXT: ## %bb.1: -; KNL-NEXT: movzwl 2(%rsi), %ecx -; KNL-NEXT: LBB85_2: -; KNL-NEXT: kmovw %k0, %edi -; KNL-NEXT: testb $1, %dil -; KNL-NEXT: je LBB85_4 -; KNL-NEXT: ## %bb.3: +; KNL-NEXT: kandw %k1, %k0, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k2 ; KNL-NEXT: movzwl (%rsi), %eax -; KNL-NEXT: LBB85_4: +; KNL-NEXT: movzwl 2(%rsi), %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 +; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k2} {z} +; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, %ecx +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 +; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: movw %ax, (%rdx) ; KNL-NEXT: movw %cx, 2(%rdx) ; KNL-NEXT: retq @@ -2301,22 +2299,20 @@ ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $7, %k2, %k2 ; SKX-NEXT: korw %k1, %k2, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: xorl %eax, %eax -; SKX-NEXT: testb $1, %cl -; SKX-NEXT: movl $0, %ecx -; SKX-NEXT: je LBB85_2 -; SKX-NEXT: ## %bb.1: -; SKX-NEXT: movzwl 2(%rsi), %ecx -; SKX-NEXT: LBB85_2: -; SKX-NEXT: kmovd %k0, %edi -; SKX-NEXT: testb $1, %dil -; SKX-NEXT: je LBB85_4 -; SKX-NEXT: ## %bb.3: +; SKX-NEXT: kandw %k1, %k0, %k1 +; SKX-NEXT: kshiftrb $1, %k1, %k2 ; SKX-NEXT: movzwl (%rsi), %eax -; SKX-NEXT: LBB85_4: +; SKX-NEXT: movzwl 2(%rsi), %ecx +; SKX-NEXT: vmovd %ecx, %xmm0 +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k2} {z} +; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, %ecx +; SKX-NEXT: vmovd %eax, %xmm0 +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: movw %ax, (%rdx) ; SKX-NEXT: movw %cx, 2(%rdx) ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll --- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -156,203 +156,153 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { ; CHECK-LABEL: test_mask_load_16xf16: ; CHECK: ## %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovmskb %xmm0, %r11d -; CHECK-NEXT: testb $1, %r11b +; CHECK-NEXT: vpmovmskb %xmm0, %ecx +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: je LBB12_1 ; CHECK-NEXT: ## %bb.2: ## %cond.load -; CHECK-NEXT: movzwl (%rsi), %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm8 ; CHECK-NEXT: jmp LBB12_3 ; CHECK-NEXT: LBB12_1: -; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: vpxor %xmm8, %xmm8, %xmm8 ; CHECK-NEXT: LBB12_3: ## %else -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: testb $2, %r11b +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; CHECK-NEXT: vmovdqa %xmm2, %xmm10 +; CHECK-NEXT: vmovdqa %xmm2, %xmm4 +; CHECK-NEXT: vmovdqa %xmm2, %xmm5 +; CHECK-NEXT: vmovdqa %xmm2, %xmm6 +; CHECK-NEXT: vmovdqa %xmm2, %xmm7 +; CHECK-NEXT: vmovdqa %xmm2, %xmm1 +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vmovdqa %xmm2, %xmm3 +; CHECK-NEXT: vmovdqa %xmm2, %xmm11 +; CHECK-NEXT: vmovdqa %xmm2, %xmm12 +; CHECK-NEXT: vmovdqa %xmm2, %xmm13 +; CHECK-NEXT: vmovdqa %xmm2, %xmm14 +; CHECK-NEXT: testb $2, %cl ; CHECK-NEXT: je LBB12_4 ; CHECK-NEXT: ## %bb.5: ## %cond.load1 -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movl %edi, %r12d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %edi, %r13d -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movl %edi, %r8d -; CHECK-NEXT: movl %edi, %r9d -; CHECK-NEXT: movl %edi, %r10d -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movzwl 2(%rsi), %edi -; CHECK-NEXT: ## kill: def $di killed $di def $edi -; CHECK-NEXT: testb $4, %r11b +; CHECK-NEXT: vmovdqa %xmm2, %xmm15 +; CHECK-NEXT: vpinsrw $0, 2(%rsi), %xmm0, %xmm2 +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: jne LBB12_7 ; CHECK-NEXT: jmp LBB12_8 ; CHECK-NEXT: LBB12_4: -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movl %edi, %r12d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %edi, %r13d -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movl %edi, %r8d -; CHECK-NEXT: movl %edi, %r9d -; CHECK-NEXT: movl %edi, %r10d -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testb $4, %r11b +; CHECK-NEXT: vmovdqa %xmm2, %xmm15 +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: je LBB12_8 ; CHECK-NEXT: LBB12_7: ## %cond.load4 -; CHECK-NEXT: movzwl 4(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: vpinsrw $0, 4(%rsi), %xmm0, %xmm10 ; CHECK-NEXT: LBB12_8: ## %else5 -; CHECK-NEXT: testb $8, %r11b +; CHECK-NEXT: testb $8, %cl ; CHECK-NEXT: jne LBB12_9 ; CHECK-NEXT: ## %bb.10: ## %else8 -; CHECK-NEXT: testb $16, %r11b +; CHECK-NEXT: testb $16, %cl ; CHECK-NEXT: jne LBB12_11 ; CHECK-NEXT: LBB12_12: ## %else11 -; CHECK-NEXT: testb $32, %r11b +; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: jne LBB12_13 ; CHECK-NEXT: LBB12_14: ## %else14 -; CHECK-NEXT: testb $64, %r11b +; CHECK-NEXT: testb $64, %cl ; CHECK-NEXT: jne LBB12_15 ; CHECK-NEXT: LBB12_16: ## %else17 -; CHECK-NEXT: testb $-128, %r11b +; CHECK-NEXT: testb $-128, %cl ; CHECK-NEXT: jne LBB12_17 ; CHECK-NEXT: LBB12_18: ## %else20 -; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 +; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 ; CHECK-NEXT: jne LBB12_19 ; CHECK-NEXT: LBB12_20: ## %else23 -; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 ; CHECK-NEXT: jne LBB12_21 ; CHECK-NEXT: LBB12_22: ## %else26 -; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 ; CHECK-NEXT: jne LBB12_23 ; CHECK-NEXT: LBB12_24: ## %else29 -; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 ; CHECK-NEXT: jne LBB12_25 ; CHECK-NEXT: LBB12_26: ## %else32 -; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 -; CHECK-NEXT: je LBB12_28 -; CHECK-NEXT: LBB12_27: ## %cond.load34 -; CHECK-NEXT: movzwl 24(%rsi), %edx +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: jne LBB12_27 ; CHECK-NEXT: LBB12_28: ## %else35 -; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $8192, %r11d ## imm = 0x2000 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 ; CHECK-NEXT: jne LBB12_29 -; CHECK-NEXT: ## %bb.30: ## %else38 -; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 +; CHECK-NEXT: LBB12_30: ## %else38 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 ; CHECK-NEXT: jne LBB12_31 ; CHECK-NEXT: LBB12_32: ## %else41 -; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 -; CHECK-NEXT: je LBB12_33 -; CHECK-NEXT: LBB12_34: ## %cond.load43 -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movzwl 30(%rsi), %esi -; CHECK-NEXT: jmp LBB12_35 +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 +; CHECK-NEXT: je LBB12_34 +; CHECK-NEXT: LBB12_33: ## %cond.load43 +; CHECK-NEXT: vpinsrw $0, 30(%rsi), %xmm0, %xmm9 +; CHECK-NEXT: LBB12_34: ## %else44 +; CHECK-NEXT: vpextrw $0, %xmm8, (%rax) +; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rax) +; CHECK-NEXT: vpextrw $0, %xmm10, 4(%rax) +; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rax) +; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rax) +; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rax) +; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rax) +; CHECK-NEXT: vpextrw $0, %xmm1, 14(%rax) +; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rax) +; CHECK-NEXT: vpextrw $0, %xmm3, 18(%rax) +; CHECK-NEXT: vpextrw $0, %xmm11, 20(%rax) +; CHECK-NEXT: vpextrw $0, %xmm12, 22(%rax) +; CHECK-NEXT: vpextrw $0, %xmm13, 24(%rax) +; CHECK-NEXT: vpextrw $0, %xmm14, 26(%rax) +; CHECK-NEXT: vpextrw $0, %xmm15, 28(%rax) +; CHECK-NEXT: vpextrw $0, %xmm9, 30(%rax) +; CHECK-NEXT: retq ; CHECK-NEXT: LBB12_9: ## %cond.load7 -; CHECK-NEXT: movzwl 6(%rsi), %r12d -; CHECK-NEXT: testb $16, %r11b +; CHECK-NEXT: vpinsrw $0, 6(%rsi), %xmm0, %xmm4 +; CHECK-NEXT: testb $16, %cl ; CHECK-NEXT: je LBB12_12 ; CHECK-NEXT: LBB12_11: ## %cond.load10 -; CHECK-NEXT: movzwl 8(%rsi), %ebx -; CHECK-NEXT: testb $32, %r11b +; CHECK-NEXT: vpinsrw $0, 8(%rsi), %xmm0, %xmm5 +; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: je LBB12_14 ; CHECK-NEXT: LBB12_13: ## %cond.load13 -; CHECK-NEXT: movzwl 10(%rsi), %ebp -; CHECK-NEXT: testb $64, %r11b +; CHECK-NEXT: vpinsrw $0, 10(%rsi), %xmm0, %xmm6 +; CHECK-NEXT: testb $64, %cl ; CHECK-NEXT: je LBB12_16 ; CHECK-NEXT: LBB12_15: ## %cond.load16 -; CHECK-NEXT: movzwl 12(%rsi), %r13d -; CHECK-NEXT: testb $-128, %r11b +; CHECK-NEXT: vpinsrw $0, 12(%rsi), %xmm0, %xmm7 +; CHECK-NEXT: testb $-128, %cl ; CHECK-NEXT: je LBB12_18 ; CHECK-NEXT: LBB12_17: ## %cond.load19 -; CHECK-NEXT: movzwl 14(%rsi), %r14d -; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 +; CHECK-NEXT: vpinsrw $0, 14(%rsi), %xmm0, %xmm1 +; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 ; CHECK-NEXT: je LBB12_20 ; CHECK-NEXT: LBB12_19: ## %cond.load22 -; CHECK-NEXT: movzwl 16(%rsi), %r8d -; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 +; CHECK-NEXT: vpinsrw $0, 16(%rsi), %xmm0, %xmm0 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 ; CHECK-NEXT: je LBB12_22 ; CHECK-NEXT: LBB12_21: ## %cond.load25 -; CHECK-NEXT: movzwl 18(%rsi), %r9d -; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 +; CHECK-NEXT: vpinsrw $0, 18(%rsi), %xmm0, %xmm3 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 ; CHECK-NEXT: je LBB12_24 ; CHECK-NEXT: LBB12_23: ## %cond.load28 -; CHECK-NEXT: movzwl 20(%rsi), %r10d -; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 +; CHECK-NEXT: vpinsrw $0, 20(%rsi), %xmm0, %xmm11 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 ; CHECK-NEXT: je LBB12_26 ; CHECK-NEXT: LBB12_25: ## %cond.load31 -; CHECK-NEXT: movzwl 22(%rsi), %r15d -; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 -; CHECK-NEXT: jne LBB12_27 -; CHECK-NEXT: jmp LBB12_28 +; CHECK-NEXT: vpinsrw $0, 22(%rsi), %xmm0, %xmm12 +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: je LBB12_28 +; CHECK-NEXT: LBB12_27: ## %cond.load34 +; CHECK-NEXT: vpinsrw $0, 24(%rsi), %xmm0, %xmm13 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 +; CHECK-NEXT: je LBB12_30 ; CHECK-NEXT: LBB12_29: ## %cond.load37 -; CHECK-NEXT: movzwl 26(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 +; CHECK-NEXT: vpinsrw $0, 26(%rsi), %xmm0, %xmm14 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 ; CHECK-NEXT: je LBB12_32 ; CHECK-NEXT: LBB12_31: ## %cond.load40 -; CHECK-NEXT: movzwl 28(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 -; CHECK-NEXT: jne LBB12_34 -; CHECK-NEXT: LBB12_33: -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: LBB12_35: ## %else44 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload -; CHECK-NEXT: movw %dx, (%rax) -; CHECK-NEXT: movw %di, 2(%rax) -; CHECK-NEXT: movw %cx, 4(%rax) -; CHECK-NEXT: movw %r12w, 6(%rax) -; CHECK-NEXT: movw %bx, 8(%rax) -; CHECK-NEXT: movw %bp, 10(%rax) -; CHECK-NEXT: movw %r13w, 12(%rax) -; CHECK-NEXT: movw %r14w, 14(%rax) -; CHECK-NEXT: movw %r8w, 16(%rax) -; CHECK-NEXT: movw %r9w, 18(%rax) -; CHECK-NEXT: movw %r10w, 20(%rax) -; CHECK-NEXT: movw %r15w, 22(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 24(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 26(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 28(%rax) -; CHECK-NEXT: movw %si, 30(%rax) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq +; CHECK-NEXT: vpinsrw $0, 28(%rsi), %xmm0, %xmm15 +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 +; CHECK-NEXT: jne LBB12_33 +; CHECK-NEXT: jmp LBB12_34 %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer) ret <16 x half> %res } @@ -414,78 +364,76 @@ ; CHECK-NEXT: LBB13_32: ## %else30 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB13_1: ## %cond.store -; CHECK-NEXT: movw %si, (%rdi) +; CHECK-NEXT: vpextrw $0, %xmm1, (%rdi) ; CHECK-NEXT: testb $2, %al ; CHECK-NEXT: je LBB13_4 ; CHECK-NEXT: LBB13_3: ## %cond.store1 -; CHECK-NEXT: movw %dx, 2(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rdi) ; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: je LBB13_6 ; CHECK-NEXT: LBB13_5: ## %cond.store3 -; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm3, 4(%rdi) ; CHECK-NEXT: testb $8, %al ; CHECK-NEXT: je LBB13_8 ; CHECK-NEXT: LBB13_7: ## %cond.store5 -; CHECK-NEXT: movw %r8w, 6(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rdi) ; CHECK-NEXT: testb $16, %al ; CHECK-NEXT: je LBB13_10 ; CHECK-NEXT: LBB13_9: ## %cond.store7 -; CHECK-NEXT: movw %r9w, 8(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rdi) ; CHECK-NEXT: testb $32, %al ; CHECK-NEXT: je LBB13_12 ; CHECK-NEXT: LBB13_11: ## %cond.store9 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 10(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rdi) ; CHECK-NEXT: testb $64, %al ; CHECK-NEXT: je LBB13_14 ; CHECK-NEXT: LBB13_13: ## %cond.store11 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 12(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rdi) ; CHECK-NEXT: testb $-128, %al ; CHECK-NEXT: je LBB13_16 ; CHECK-NEXT: LBB13_15: ## %cond.store13 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 14(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 14(%rdi) ; CHECK-NEXT: testl $256, %eax ## imm = 0x100 ; CHECK-NEXT: je LBB13_18 ; CHECK-NEXT: LBB13_17: ## %cond.store15 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 16(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rdi) ; CHECK-NEXT: testl $512, %eax ## imm = 0x200 ; CHECK-NEXT: je LBB13_20 ; CHECK-NEXT: LBB13_19: ## %cond.store17 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 18(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 18(%rdi) ; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 ; CHECK-NEXT: je LBB13_22 ; CHECK-NEXT: LBB13_21: ## %cond.store19 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 20(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 20(%rdi) ; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 ; CHECK-NEXT: je LBB13_24 ; CHECK-NEXT: LBB13_23: ## %cond.store21 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 22(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 22(%rdi) ; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 ; CHECK-NEXT: je LBB13_26 ; CHECK-NEXT: LBB13_25: ## %cond.store23 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 24(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 24(%rdi) ; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 ; CHECK-NEXT: je LBB13_28 ; CHECK-NEXT: LBB13_27: ## %cond.store25 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 26(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 26(%rdi) ; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 ; CHECK-NEXT: je LBB13_30 ; CHECK-NEXT: LBB13_29: ## %cond.store27 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 28(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 28(%rdi) ; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 ; CHECK-NEXT: je LBB13_32 ; CHECK-NEXT: LBB13_31: ## %cond.store29 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movw %ax, 30(%rdi) +; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 30(%rdi) ; CHECK-NEXT: retq call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) ret void diff --git a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll --- a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %2 ; CHECK-NEXT: t8: i32 = add t2, Constant:i32<4> ; CHECK-NEXT: t22: ch,glue = CopyToReg t17, Register:i32 %5, t8 -; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch, TargetConstant:i64<8>, TargetConstant:i32<2293769>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1 +; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch, TargetConstant:i64<8>, TargetConstant:i32<2359305>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1 define i32 @test(i32 %a, i32 %b, i32 %c) { entry: diff --git a/llvm/test/CodeGen/X86/cvt16-2.ll b/llvm/test/CodeGen/X86/cvt16-2.ll --- a/llvm/test/CodeGen/X86/cvt16-2.ll +++ b/llvm/test/CodeGen/X86/cvt16-2.ll @@ -10,6 +10,7 @@ ; LIBCALL-NEXT: .cfi_offset %rbx, -16 ; LIBCALL-NEXT: movq %rdi, %rbx ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; LIBCALL-NEXT: movw %ax, (%rbx) ; LIBCALL-NEXT: popq %rbx ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 @@ -28,7 +29,7 @@ define float @test2(i16* nocapture %src) { ; LIBCALL-LABEL: test2: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; FP16-LABEL: test2: @@ -47,7 +48,6 @@ ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; LIBCALL-NEXT: movzwl %ax, %edi ; LIBCALL-NEXT: popq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL @@ -68,7 +68,7 @@ ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 -; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 ; LIBCALL-NEXT: popq %rax @@ -88,7 +88,14 @@ define i16 @test5(double %src) { ; LIBCALL-LABEL: test5: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; LIBCALL-NEXT: pushq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: callq __truncdfhf2@PLT +; LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax +; LIBCALL-NEXT: popq %rcx +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq ; ; FP16-LABEL: test5: ; FP16: # %bb.0: @@ -106,10 +113,8 @@ ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 -; LIBCALL-NEXT: movzwl (%rdi), %edi -; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) -; LIBCALL-NEXT: flds {{[0-9]+}}(%rsp) +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; LIBCALL-NEXT: callq __extendhfxf2@PLT ; LIBCALL-NEXT: popq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq @@ -131,7 +136,16 @@ define i16 @test7(x86_fp80 %src) { ; LIBCALL-LABEL: test7: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: jmp __truncxfhf2@PLT # TAILCALL +; LIBCALL-NEXT: subq $24, %rsp +; LIBCALL-NEXT: .cfi_def_cfa_offset 32 +; LIBCALL-NEXT: fldt {{[0-9]+}}(%rsp) +; LIBCALL-NEXT: fstpt (%rsp) +; LIBCALL-NEXT: callq __truncxfhf2@PLT +; LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax +; LIBCALL-NEXT: addq $24, %rsp +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq ; ; FP16-LABEL: test7: ; FP16: # %bb.0: diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -29,7 +29,7 @@ ; LIBCALL-NEXT: .cfi_offset %rbx, -16 ; LIBCALL-NEXT: movq %rdi, %rbx ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; LIBCALL-NEXT: movw %ax, (%rbx) +; LIBCALL-NEXT: pextrw $0, %xmm0, (%rbx) ; LIBCALL-NEXT: popq %rbx ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq @@ -37,7 +37,8 @@ ; F16C-LABEL: test1: ; F16C: # %bb.0: ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, (%rdi) +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rdi) ; F16C-NEXT: retq ; ; SOFTFLOAT-LABEL: test1: @@ -59,7 +60,7 @@ define float @test2(i16* nocapture %src) { ; LIBCALL-LABEL: test2: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; F16C-LABEL: test2: @@ -89,7 +90,6 @@ ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; LIBCALL-NEXT: movzwl %ax, %edi ; LIBCALL-NEXT: popq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL @@ -97,6 +97,9 @@ ; F16C-LABEL: test3: ; F16C: # %bb.0: ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; @@ -120,7 +123,7 @@ ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 -; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 ; LIBCALL-NEXT: popq %rax @@ -154,11 +157,22 @@ define i16 @test5(double %src) { ; LIBCALL-LABEL: test5: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; LIBCALL-NEXT: pushq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: callq __truncdfhf2@PLT +; LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax +; LIBCALL-NEXT: popq %rcx +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq ; ; F16C-LABEL: test5: ; F16C: # %bb.0: -; F16C-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: retq ; ; SOFTFLOAT-LABEL: test5: ; SOFTFLOAT: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll --- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll +++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=F16C +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX define zeroext i16 @test1_fast(double %d) #0 { ; F16C-LABEL: test1_fast: @@ -16,6 +16,8 @@ ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 ; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -42,6 +44,8 @@ ; AVX-NEXT: fldt {{[0-9]+}}(%rsp) ; AVX-NEXT: fstpt (%rsp) ; AVX-NEXT: callq __truncxfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: addq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -51,30 +55,52 @@ } define zeroext i16 @test1(double %d) #1 { -; ALL-LABEL: test1: -; ALL: # %bb.0: # %entry -; ALL-NEXT: pushq %rax -; ALL-NEXT: .cfi_def_cfa_offset 16 -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: popq %rcx -; ALL-NEXT: .cfi_def_cfa_offset 8 -; ALL-NEXT: retq +; F16C-LABEL: test1: +; F16C: # %bb.0: # %entry +; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: retq +; +; AVX-LABEL: test1: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d) ret i16 %0 } define zeroext i16 @test2(x86_fp80 %d) #1 { -; ALL-LABEL: test2: -; ALL: # %bb.0: # %entry -; ALL-NEXT: subq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 32 -; ALL-NEXT: fldt {{[0-9]+}}(%rsp) -; ALL-NEXT: fstpt (%rsp) -; ALL-NEXT: callq __truncxfhf2@PLT -; ALL-NEXT: addq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 8 -; ALL-NEXT: retq +; F16C-LABEL: test2: +; F16C: # %bb.0: # %entry +; F16C-NEXT: fldt {{[0-9]+}}(%rsp) +; F16C-NEXT: fstps -{{[0-9]+}}(%rsp) +; F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: retq +; +; AVX-LABEL: test2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX-NEXT: fstpt (%rsp) +; AVX-NEXT: callq __truncxfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d) ret i16 %0 diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -111,11 +111,9 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movzwl %ax, %edi ; X64-NEXT: popq %rax ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -10,18 +10,16 @@ ; SSE2: ## %bb.0: ## %entry ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: .cfi_def_cfa_offset 16 -; SSE2-NEXT: movzwl %di, %edi ; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: callq ___truncsfhf2 -; SSE2-NEXT: popq %rcx +; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: round_f16: ; SSE41: ## %bb.0: ## %entry ; SSE41-NEXT: pushq %rax ; SSE41-NEXT: .cfi_def_cfa_offset 16 -; SSE41-NEXT: movzwl %di, %edi ; SSE41-NEXT: callq ___extendhfsf2 ; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; SSE41-NEXT: andps %xmm0, %xmm1 @@ -30,14 +28,13 @@ ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: roundss $11, %xmm1, %xmm0 ; SSE41-NEXT: callq ___truncsfhf2 -; SSE41-NEXT: popq %rcx +; SSE41-NEXT: popq %rax ; SSE41-NEXT: retq ; ; AVX1-LABEL: round_f16: ; AVX1: ## %bb.0: ## %entry ; AVX1-NEXT: pushq %rax ; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: movzwl %di, %edi ; AVX1-NEXT: callq ___extendhfsf2 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] @@ -45,12 +42,13 @@ ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: callq ___truncsfhf2 -; AVX1-NEXT: popq %rcx +; AVX1-NEXT: popq %rax ; AVX1-NEXT: retq ; ; AVX512F-LABEL: round_f16: ; AVX512F: ## %bb.0: ## %entry -; AVX512F-NEXT: movzwl %di, %eax +; AVX512F-NEXT: vpextrw $0, %xmm0, %eax +; AVX512F-NEXT: movzwl %ax, %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] @@ -59,7 +57,7 @@ ; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512FP16-LABEL: round_f16: diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -10,44 +10,42 @@ ; SSE2: ## %bb.0: ## %entry ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: .cfi_def_cfa_offset 16 -; SSE2-NEXT: movzwl %di, %edi ; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: callq ___truncsfhf2 -; SSE2-NEXT: popq %rcx +; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: roundeven_f16: ; SSE41: ## %bb.0: ## %entry ; SSE41-NEXT: pushq %rax ; SSE41-NEXT: .cfi_def_cfa_offset 16 -; SSE41-NEXT: movzwl %di, %edi ; SSE41-NEXT: callq ___extendhfsf2 ; SSE41-NEXT: roundss $8, %xmm0, %xmm0 ; SSE41-NEXT: callq ___truncsfhf2 -; SSE41-NEXT: popq %rcx +; SSE41-NEXT: popq %rax ; SSE41-NEXT: retq ; ; AVX1-LABEL: roundeven_f16: ; AVX1: ## %bb.0: ## %entry ; AVX1-NEXT: pushq %rax ; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: movzwl %di, %edi ; AVX1-NEXT: callq ___extendhfsf2 ; AVX1-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: callq ___truncsfhf2 -; AVX1-NEXT: popq %rcx +; AVX1-NEXT: popq %rax ; AVX1-NEXT: retq ; ; AVX512F-LABEL: roundeven_f16: ; AVX512F: ## %bb.0: ## %entry -; AVX512F-NEXT: movzwl %di, %eax +; AVX512F-NEXT: vpextrw $0, %xmm0, %eax +; AVX512F-NEXT: movzwl %ax, %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512FP16-LABEL: roundeven_f16: diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -19,10 +19,9 @@ ; X64-SSE-LABEL: TestFPExtF16_F128: ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax -; X64-SSE-NEXT: movzwl vf16(%rip), %edi -; X64-SSE-NEXT: callq __gnu_h2f_ieee@PLT -; X64-SSE-NEXT: callq __extendsftf2@PLT -; X64-SSE-NEXT: movaps %xmm0, vf128(%rip) +; X64-SSE-NEXT: pinsrw $0, vf16(%rip), %xmm0 +; X64-SSE-NEXT: callq __extendhftf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, vf128(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; @@ -218,8 +217,9 @@ ; X64-SSE-LABEL: TestFPTruncF128_F16: ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax -; X64-SSE-NEXT: movaps vf128(%rip), %xmm0 +; X64-SSE-NEXT: movdqa vf128(%rip), %xmm0 ; X64-SSE-NEXT: callq __trunctfhf2@PLT +; X64-SSE-NEXT: pextrw $0, %xmm0, %eax ; X64-SSE-NEXT: movw %ax, vf16(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll --- a/llvm/test/CodeGen/X86/fpclamptosat.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat.ll @@ -139,15 +139,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; CHECK-NEXT: cmovael %eax, %ecx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; CHECK-NEXT: cmovbel %ecx, %edx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovnpl %ecx, %eax +; CHECK-NEXT: cmovnpl %edx, %eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -166,19 +168,12 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: sarq $63, %rcx -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovbl %edx, %eax +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovael %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -195,12 +190,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; CHECK-NEXT: cmpq %rax, %rcx -; CHECK-NEXT: cmovlq %rcx, %rax +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: cmovgeq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: cmovlel %ecx, %eax @@ -343,13 +336,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 +; CHECK-NEXT: cmovael %eax, %ecx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32767, %edx # imm = 0x7FFF +; CHECK-NEXT: cmovbel %ecx, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpl %edx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -369,12 +366,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF -; CHECK-NEXT: cmovbl %ecx, %eax +; CHECK-NEXT: callq __fixhfsi@PLT +; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF +; CHECK-NEXT: cmovael %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -392,15 +387,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovgel %ecx, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: cmovlel %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -562,15 +555,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; CHECK-NEXT: cmovaeq %rax, %rcx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rcx, %rdx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovnpq %rcx, %rax +; CHECK-NEXT: cmovnpq %rdx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -589,9 +584,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: cmovneq %rcx, %rax @@ -611,9 +604,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movl $1, %esi @@ -768,15 +759,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; CHECK-NEXT: cmovael %eax, %ecx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; CHECK-NEXT: cmovbel %ecx, %edx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovnpl %ecx, %eax +; CHECK-NEXT: cmovnpl %edx, %eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -793,15 +786,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: cmovaeq %rcx, %rax @@ -821,15 +806,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: cmovlq %rax, %rcx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testq %rcx, %rcx -; CHECK-NEXT: cmovgq %rcx, %rax +; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovleq %rcx, %rax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -957,13 +940,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 +; CHECK-NEXT: cmovael %eax, %ecx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32767, %edx # imm = 0x7FFF +; CHECK-NEXT: cmovbel %ecx, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpl %edx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -981,12 +968,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF -; CHECK-NEXT: cmovbl %ecx, %eax +; CHECK-NEXT: callq __fixhfsi@PLT +; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF +; CHECK-NEXT: cmovael %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1003,15 +988,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovgel %ecx, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: cmovlel %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1163,15 +1146,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; CHECK-NEXT: cmovaeq %rax, %rcx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rcx, %rdx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovnpq %rcx, %rax +; CHECK-NEXT: cmovnpq %rdx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -1188,9 +1173,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: cmovneq %rcx, %rax @@ -1211,9 +1194,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movl $1, %esi diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -395,40 +395,30 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-LABEL: stest_f16i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] @@ -448,7 +438,7 @@ ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pandn %xmm2, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 -; CHECK-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 ; CHECK-NEXT: movdqa %xmm4, %xmm5 @@ -491,13 +481,7 @@ ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -513,65 +497,31 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-LABEL: utesth_f16i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %ecx, %ebp -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl %si, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] @@ -590,7 +540,7 @@ ; CHECK-NEXT: pand %xmm5, %xmm0 ; CHECK-NEXT: pandn %xmm1, %xmm5 ; CHECK-NEXT: por %xmm0, %xmm5 -; CHECK-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; CHECK-NEXT: pxor %xmm6, %xmm2 ; CHECK-NEXT: movdqa %xmm4, %xmm0 ; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 @@ -604,13 +554,7 @@ ; CHECK-NEXT: pandn %xmm1, %xmm0 ; CHECK-NEXT: por %xmm6, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -624,40 +568,30 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-LABEL: ustest_f16i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm3 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] @@ -677,7 +611,7 @@ ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pandn %xmm2, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 -; CHECK-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 ; CHECK-NEXT: movdqa %xmm4, %xmm5 @@ -713,13 +647,7 @@ ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -891,98 +819,67 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -998,79 +895,60 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] @@ -1097,19 +975,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm4, %xmm0 -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1123,79 +989,60 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %r14d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r13d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r12d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] @@ -1225,19 +1072,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm3, %xmm0 -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1567,24 +1402,21 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) { ; CHECK-LABEL: stest_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %r14d -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: callq __fixhfti@PLT +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmpq %rsi, %rax @@ -1592,28 +1424,28 @@ ; CHECK-NEXT: sbbq $0, %rdi ; CHECK-NEXT: cmovgeq %rcx, %rdx ; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: cmpq %rsi, %rbx -; CHECK-NEXT: movq %rbp, %rdi +; CHECK-NEXT: cmpq %rsi, %r14 +; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rbp, %rcx -; CHECK-NEXT: cmovlq %rbx, %rsi -; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rsi, %rdi -; CHECK-NEXT: movq $-1, %rbp +; CHECK-NEXT: cmovlq %rbx, %rcx +; CHECK-NEXT: cmovlq %r14, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000 +; CHECK-NEXT: cmpq %rsi, %r8 ; CHECK-NEXT: movq $-1, %rbx -; CHECK-NEXT: sbbq %rcx, %rbx -; CHECK-NEXT: cmovgeq %rdi, %rsi -; CHECK-NEXT: cmpq %rax, %rdi -; CHECK-NEXT: sbbq %rdx, %rbp -; CHECK-NEXT: cmovgeq %rdi, %rax +; CHECK-NEXT: movq $-1, %rdi +; CHECK-NEXT: sbbq %rcx, %rdi +; CHECK-NEXT: cmovgeq %r8, %rsi +; CHECK-NEXT: cmpq %rax, %r8 +; CHECK-NEXT: sbbq %rdx, %rbx +; CHECK-NEXT: cmovgeq %r8, %rax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movq %rsi, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1629,24 +1461,22 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-LABEL: utesth_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movzwl %si, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: cmovneq %rcx, %rax @@ -1655,11 +1485,11 @@ ; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1673,31 +1503,28 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) { ; CHECK-LABEL: ustest_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %r14d -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovgq %rsi, %rdx ; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: testq %rbp, %rbp -; CHECK-NEXT: cmovleq %rbp, %rsi +; CHECK-NEXT: testq %r14, %r14 +; CHECK-NEXT: cmovleq %r14, %rsi ; CHECK-NEXT: cmovgq %rcx, %rbx ; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: negq %rdi @@ -1712,11 +1539,11 @@ ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movq %rbx, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2115,40 +1942,30 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-LABEL: stest_f16i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] @@ -2168,7 +1985,7 @@ ; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: pandn %xmm4, %xmm1 ; CHECK-NEXT: por %xmm2, %xmm1 -; CHECK-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm3, %xmm5 @@ -2211,13 +2028,7 @@ ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2231,65 +2042,31 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-LABEL: utesth_f16i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %ecx, %ebp -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl %si, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rcx, %rdx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] @@ -2308,7 +2085,7 @@ ; CHECK-NEXT: pand %xmm4, %xmm0 ; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; CHECK-NEXT: pxor %xmm6, %xmm1 ; CHECK-NEXT: movdqa %xmm3, %xmm0 ; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 @@ -2322,13 +2099,7 @@ ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm6, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2341,40 +2112,30 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-LABEL: ustest_f16i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: subq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfdi@PLT ; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] @@ -2394,7 +2155,7 @@ ; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: pandn %xmm4, %xmm1 ; CHECK-NEXT: por %xmm2, %xmm1 -; CHECK-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm3, %xmm5 @@ -2430,13 +2191,7 @@ ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2596,98 +2351,67 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2701,79 +2425,60 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] @@ -2800,19 +2505,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm4, %xmm0 -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2825,79 +2518,60 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %esi, %r14d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d -; CHECK-NEXT: movzwl %cx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r13d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r12d, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfsi@PLT ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] @@ -2927,19 +2601,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm3, %xmm0 -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -3283,24 +2945,21 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: stest_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movzwl %bp, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: movq %rcx, %rsi @@ -3318,28 +2977,28 @@ ; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: cmovsq %r14, %rdi ; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: movabsq $-9223372036854775808, %rbp # imm = 0x8000000000000000 -; CHECK-NEXT: movq %rbp, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 +; CHECK-NEXT: movq %rbx, %rsi ; CHECK-NEXT: cmovnsq %rcx, %rsi -; CHECK-NEXT: cmpq %rbp, %rcx -; CHECK-NEXT: cmovbeq %rbp, %rcx +; CHECK-NEXT: cmpq %rbx, %rcx +; CHECK-NEXT: cmovbeq %rbx, %rcx ; CHECK-NEXT: cmpq $-1, %rdi ; CHECK-NEXT: cmovneq %rsi, %rcx ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movq %rbp, %rsi +; CHECK-NEXT: movq %rbx, %rsi ; CHECK-NEXT: cmovnsq %rax, %rsi -; CHECK-NEXT: cmpq %rbp, %rax -; CHECK-NEXT: cmovbeq %rbp, %rax +; CHECK-NEXT: cmpq %rbx, %rax +; CHECK-NEXT: cmovbeq %rbx, %rax ; CHECK-NEXT: cmpq $-1, %rdx ; CHECK-NEXT: cmovneq %rsi, %rax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -3353,41 +3012,39 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: utesth_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movzwl %si, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixunssfti@PLT +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: cmpq $1, %rdx ; CHECK-NEXT: cmoveq %rcx, %rax -; CHECK-NEXT: testq %rbp, %rbp +; CHECK-NEXT: testq %r14, %r14 ; CHECK-NEXT: cmovneq %rcx, %rbx -; CHECK-NEXT: cmpq $1, %rbp +; CHECK-NEXT: cmpq $1, %r14 ; CHECK-NEXT: cmoveq %rcx, %rbx ; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -3400,24 +3057,21 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: ustest_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %r14d -; CHECK-NEXT: movzwl %di, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: movzwl %r14w, %edi -; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: callq __fixsfti@PLT +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movl $1, %esi @@ -3426,10 +3080,10 @@ ; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: cmpq $1, %rdx ; CHECK-NEXT: cmoveq %rcx, %rax -; CHECK-NEXT: testq %rbp, %rbp -; CHECK-NEXT: cmovleq %rbp, %rsi +; CHECK-NEXT: testq %r14, %r14 +; CHECK-NEXT: cmovleq %r14, %rsi ; CHECK-NEXT: cmovgq %rcx, %rbx -; CHECK-NEXT: cmpq $1, %rbp +; CHECK-NEXT: cmpq $1, %r14 ; CHECK-NEXT: cmoveq %rcx, %rbx ; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: cmovsq %rcx, %rbx @@ -3438,11 +3092,11 @@ ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movq %rbx, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: popq %rbx +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -2109,15 +2109,22 @@ ; X86-SSE-LABEL: test_signed_i1_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: cvttss2si %xmm0, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $255, %eax +; X86-SSE-NEXT: cmovael %ecx, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: xorps %xmm1, %xmm1 -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmoval %ecx, %eax +; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 +; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2125,13 +2132,17 @@ ; X64-LABEL: test_signed_i1_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: minss %xmm1, %xmm0 -; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: cvttss2si %xmm0, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $255, %eax +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmoval %ecx, %eax +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: cmovpl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2192,15 +2203,22 @@ ; X86-SSE-LABEL: test_signed_i8_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $128, %ecx +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $127, %edx +; X86-SSE-NEXT: cmovbel %ecx, %edx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 +; X86-SSE-NEXT: cmovnpl %edx, %eax ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2208,13 +2226,17 @@ ; X64-LABEL: test_signed_i8_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $128, %ecx +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $127, %edx +; X64-NEXT: cmovbel %ecx, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: cmovnpl %edx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2276,15 +2298,22 @@ ; X86-SSE-LABEL: test_signed_i13_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $61440, %ecx # imm = 0xF000 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $4095, %edx # imm = 0xFFF +; X86-SSE-NEXT: cmovbel %ecx, %edx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 +; X86-SSE-NEXT: cmovnpl %edx, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2292,13 +2321,17 @@ ; X64-LABEL: test_signed_i13_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $61440, %ecx # imm = 0xF000 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $4095, %edx # imm = 0xFFF +; X64-NEXT: cmovbel %ecx, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: cmovnpl %edx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2360,15 +2393,22 @@ ; X86-SSE-LABEL: test_signed_i16_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $32767, %edx # imm = 0x7FFF +; X86-SSE-NEXT: cmovbel %ecx, %edx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 +; X86-SSE-NEXT: cmovnpl %edx, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2376,13 +2416,17 @@ ; X64-LABEL: test_signed_i16_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $32767, %edx # imm = 0x7FFF +; X64-NEXT: cmovbel %ecx, %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: cmovnpl %edx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2444,31 +2488,39 @@ ; X86-SSE-LABEL: test_signed_i19_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $-262144, %ecx # imm = 0xFFFC0000 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $262143, %edx # imm = 0x3FFFF +; X86-SSE-NEXT: cmovbel %ecx, %edx ; X86-SSE-NEXT: xorl %eax, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 -; X86-SSE-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: cvttss2si %xmm0, %ecx -; X86-SSE-NEXT: cmovnpl %ecx, %eax +; X86-SSE-NEXT: cmovnpl %edx, %eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl ; ; X64-LABEL: test_signed_i19_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT +; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $-262144, %ecx # imm = 0xFFFC0000 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $262143, %edx # imm = 0x3FFFF +; X64-NEXT: cmovbel %ecx, %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: ucomiss %xmm0, %xmm0 -; X64-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: cvttss2si %xmm0, %ecx -; X64-NEXT: cmovnpl %ecx, %eax +; X64-NEXT: cmovnpl %edx, %eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq %x = call i19 @llvm.fptosi.sat.i19.f16(half %f) @@ -2529,33 +2581,39 @@ ; X86-SSE-LABEL: test_signed_i32_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: cvttss2si %xmm0, %eax ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %eax, %ecx +; X86-SSE-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ecx, %edx ; X86-SSE-NEXT: xorl %eax, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 -; X86-SSE-NEXT: cmovnpl %ecx, %eax +; X86-SSE-NEXT: cmovnpl %edx, %eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl ; ; X64-LABEL: test_signed_i32_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: cmovbel %eax, %ecx +; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF +; X64-NEXT: cmovbel %ecx, %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: ucomiss %xmm0, %xmm0 -; X64-NEXT: cmovnpl %ecx, %eax +; X64-NEXT: cmovnpl %edx, %eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq %x = call i32 @llvm.fptosi.sat.i32.f16(half %f) @@ -2634,8 +2692,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2669,7 +2728,6 @@ ; X64-LABEL: test_signed_i50_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2759,8 +2817,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2794,15 +2853,17 @@ ; X64-LABEL: test_signed_i64_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: cmovbeq %rax, %rcx +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmovaeq %rax, %rcx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: cmovbeq %rcx, %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: ucomiss %xmm0, %xmm0 -; X64-NEXT: cmovnpq %rcx, %rax +; X64-NEXT: cmovnpq %rdx, %rax ; X64-NEXT: popq %rcx ; X64-NEXT: retq %x = call i64 @llvm.fptosi.sat.i64.f16(half %f) @@ -2908,9 +2969,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $44, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2963,7 +3025,6 @@ ; X64-LABEL: test_signed_i100_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixsfti@PLT @@ -3093,9 +3154,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $44, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -3144,7 +3206,6 @@ ; X64-LABEL: test_signed_i128_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixsfti@PLT diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -542,97 +542,119 @@ ; CHECK-LABEL: test_signed_v8i1_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $65535, %ebp # imm = 0xFFFF +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebp, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebx, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -640,12 +662,8 @@ ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> %f) @@ -658,99 +676,121 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movl %r9d, %r13d -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: subq $48, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %r12d -; CHECK-NEXT: shll $8, %r12d -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $128, %r14d +; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $127, %r12d +; CHECK-NEXT: cmoval %r12d, %ebp +; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %ebp +; CHECK-NEXT: shll $8, %ebp +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %eax ; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: orl %r12d, %ebx -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %ebp +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %ebp ; CHECK-NEXT: shll $8, %ebp -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: pinsrw $1, %ebx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %ebx +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $2, %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %ebx +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: addq $48, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp @@ -763,97 +803,120 @@ ; CHECK-LABEL: test_signed_v8i16_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32768, %r14d # imm = 0x8000 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $32767, %ebp # imm = 0x7FFF +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -861,12 +924,9 @@ ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $128, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %f) @@ -877,112 +937,129 @@ ; CHECK-LABEL: test_signed_v8i32_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $2147483647, %r15d # imm = 0x7FFFFFFF -; CHECK-NEXT: cmoval %r15d, %eax -; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: movl $-2147483648, %r14d # imm = 0x80000000 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r13d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r12d, %eax +; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $128, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> %f) @@ -992,111 +1069,129 @@ define <8 x i64> @test_signed_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_signed_v8i64_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $104, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %r15, %rax -; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: movabsq $-9223372036854775808, %r14 # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rbx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: xorl %r15d, %r15d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax +; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovaq %rbx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r12, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: cmovpq %r15, %rax +; CHECK-NEXT: movq %rax, %xmm3 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; CHECK-NEXT: addq $104, %rsp +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: addq $128, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> %f) ret <8 x i64> %x @@ -1112,112 +1207,115 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ecx, %r14d -; CHECK-NEXT: movl %edx, %r12d +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl %si, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-NEXT: cmovbq %rcx, %rdx -; CHECK-NEXT: movq %rcx, %r13 +; CHECK-NEXT: movq %rcx, %r14 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %r15, %rdx +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %rcx, %rdx +; CHECK-NEXT: movq %rcx, %rbp ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax +; CHECK-NEXT: movq $-1, %r15 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %rdx +; CHECK-NEXT: cmovpq %r12, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax -; CHECK-NEXT: cmovbq %r13, %rdx +; CHECK-NEXT: cmovbq %r12, %rax +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: cmovaq %rcx, %rax -; CHECK-NEXT: movq $-1, %r12 +; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: movq $-1, %r15 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %rdx +; CHECK-NEXT: cmovpq %r12, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax -; CHECK-NEXT: cmovbq %r13, %rdx +; CHECK-NEXT: cmovbq %r12, %rax +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r12, %rax -; CHECK-NEXT: movq $-1, %r14 +; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: movq $-1, %r15 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %rdx +; CHECK-NEXT: cmovpq %r12, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax -; CHECK-NEXT: cmovbq %r13, %rdx +; CHECK-NEXT: cmovbq %r12, %rax +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r14, %rax -; CHECK-NEXT: movq $-1, %r14 +; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: movq %rbp, %r13 +; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: movq $-1, %r15 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %rdx +; CHECK-NEXT: cmovpq %r12, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rdx, %r12 +; CHECK-NEXT: movq %rdx, %rbp ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax -; CHECK-NEXT: cmovbq %r13, %r12 +; CHECK-NEXT: cmovbq %r12, %rax +; CHECK-NEXT: cmovbq %r14, %rbp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r15, %r12 -; CHECK-NEXT: cmovaq %r14, %rax +; CHECK-NEXT: cmovaq %r13, %rbp +; CHECK-NEXT: cmovaq %r15, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %r12 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: cmovpq %r12, %rbp +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT @@ -1226,38 +1324,40 @@ ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %r14 -; CHECK-NEXT: cmovbq %r13, %r15 +; CHECK-NEXT: cmovbq %r12, %r14 +; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %rax, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rax, %r15 +; CHECK-NEXT: cmovaq %r13, %r15 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %r14 -; CHECK-NEXT: cmovpq %rbp, %r15 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: cmovpq %r12, %r14 +; CHECK-NEXT: cmovpq %r12, %r15 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rax, %r13 -; CHECK-NEXT: movq %rdx, %rbp +; CHECK-NEXT: movq %rax, %r12 +; CHECK-NEXT: movq %rdx, %r13 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $0, %eax -; CHECK-NEXT: cmovbq %rax, %r13 +; CHECK-NEXT: cmovbq %rax, %r12 ; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %rcx, %rbp +; CHECK-NEXT: cmovbq %rcx, %r13 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rcx, %rbp -; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %r13 +; CHECK-NEXT: movq $-1, %rcx +; CHECK-NEXT: cmovaq %rcx, %r12 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovpq %rax, %r12 ; CHECK-NEXT: cmovpq %rax, %r13 -; CHECK-NEXT: cmovpq %rax, %rbp -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT @@ -1279,11 +1379,11 @@ ; CHECK-NEXT: cmovpq %rcx, %rdx ; CHECK-NEXT: movq %rdx, 120(%rbx) ; CHECK-NEXT: movq %rax, 112(%rbx) -; CHECK-NEXT: movq %rbp, 104(%rbx) -; CHECK-NEXT: movq %r13, 96(%rbx) +; CHECK-NEXT: movq %r13, 104(%rbx) +; CHECK-NEXT: movq %r12, 96(%rbx) ; CHECK-NEXT: movq %r15, 88(%rbx) ; CHECK-NEXT: movq %r14, 80(%rbx) -; CHECK-NEXT: movq %r12, 72(%rbx) +; CHECK-NEXT: movq %rbp, 72(%rbx) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: movq %rax, 64(%rbx) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -1922,15 +1922,20 @@ ; X86-SSE-LABEL: test_unsigned_i1_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx +; X86-SSE-NEXT: xorps %xmm1, %xmm1 +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $1, %eax +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -1938,13 +1943,15 @@ ; X64-LABEL: test_unsigned_i1_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -1997,15 +2004,20 @@ ; X86-SSE-LABEL: test_unsigned_i8_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx +; X86-SSE-NEXT: xorps %xmm1, %xmm1 +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $255, %eax +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2013,13 +2025,15 @@ ; X64-LABEL: test_unsigned_i8_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $255, %eax +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2071,15 +2085,20 @@ ; X86-SSE-LABEL: test_unsigned_i13_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx +; X86-SSE-NEXT: xorps %xmm1, %xmm1 +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $8191, %eax # imm = 0x1FFF +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2087,13 +2106,15 @@ ; X64-LABEL: test_unsigned_i13_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $8191, %eax # imm = 0x1FFF +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2145,15 +2166,20 @@ ; X86-SSE-LABEL: test_unsigned_i16_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 -; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: minss %xmm0, %xmm1 -; X86-SSE-NEXT: cvttss2si %xmm1, %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: xorl %ecx, %ecx +; X86-SSE-NEXT: xorps %xmm1, %xmm1 +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmovael %eax, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $65535, %eax # imm = 0xFFFF +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl @@ -2161,13 +2187,15 @@ ; X64-LABEL: test_unsigned_i16_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $65535, %eax # imm = 0xFFFF +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -2219,27 +2247,42 @@ ; X86-SSE-LABEL: test_unsigned_i19_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: xorps %xmm1, %xmm1 -; X86-SSE-NEXT: maxss %xmm1, %xmm0 -; X86-SSE-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: movl %eax, %ecx +; X86-SSE-NEXT: sarl $31, %ecx +; X86-SSE-NEXT: movaps %xmm0, %xmm1 +; X86-SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: cvttss2si %xmm1, %edx +; X86-SSE-NEXT: andl %ecx, %edx +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: xorl %ecx, %ecx +; X86-SSE-NEXT: xorps %xmm1, %xmm1 +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 +; X86-SSE-NEXT: cmovael %edx, %ecx +; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movl $524287, %eax # imm = 0x7FFFF +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl ; ; X64-LABEL: test_unsigned_i19_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT +; X64-NEXT: cvttss2si %xmm0, %rax +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: maxss %xmm1, %xmm0 -; X64-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: cmovael %eax, %ecx +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movl $524287, %eax # imm = 0x7FFFF +; X64-NEXT: cmovbel %ecx, %eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq %x = call i19 @llvm.fptoui.sat.i19.f16(half %f) @@ -2290,8 +2333,9 @@ ; X86-SSE-LABEL: test_unsigned_i32_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2316,7 +2360,6 @@ ; X64-LABEL: test_unsigned_i32_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: xorl %ecx, %ecx @@ -2406,8 +2449,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2452,13 +2496,19 @@ ; X64-LABEL: test_unsigned_i50_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttss2si %xmm1, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rax, %rcx +; X64-NEXT: cmovaeq %rdx, %rcx ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movabsq $1125899906842623, %rax # imm = 0x3FFFFFFFFFFFF ; X64-NEXT: cmovbeq %rcx, %rax @@ -2540,8 +2590,9 @@ ; X86-SSE-LABEL: test_unsigned_i64_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $28, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2584,7 +2635,6 @@ ; X64-LABEL: test_unsigned_i64_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: movq %rax, %rcx @@ -2689,9 +2739,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2739,7 +2790,6 @@ ; X64-LABEL: test_unsigned_i100_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixunssfti@PLT @@ -2840,9 +2890,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2888,7 +2939,6 @@ ; X64-LABEL: test_unsigned_i128_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixunssfti@PLT diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -541,97 +541,103 @@ ; CHECK-LABEL: test_unsigned_v8i1_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $1, %ebp +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -639,12 +645,8 @@ ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> %f) @@ -657,99 +659,103 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movl %r9d, %r13d -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %r12d -; CHECK-NEXT: shll $8, %r12d -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $255, %r15d +; CHECK-NEXT: cmoval %r15d, %ebp +; CHECK-NEXT: shll $8, %ebp +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %eax ; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: orl %r12d, %ebx -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %ebp ; CHECK-NEXT: shll $8, %ebp -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: pinsrw $1, %ebx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $2, %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %r15d, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp @@ -762,97 +768,103 @@ ; CHECK-LABEL: test_unsigned_v8i16_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movl %r9d, %ebp -; CHECK-NEXT: movl %r8d, %ebx -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %esi, %r15d -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm7, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movl $65535, %ebp # imm = 0xFFFF +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmovbl %ebx, %eax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -860,12 +872,8 @@ ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %f) @@ -876,113 +884,111 @@ ; CHECK-LABEL: test_unsigned_v8i32_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, %r14d -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $-1, %r12d -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: movl $-1, %ebp +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r13d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r15d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> %f) @@ -992,21 +998,17 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_unsigned_v8i64_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $104, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1016,18 +1018,19 @@ ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movq $-1, %r12 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: movq $-1, %rbx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movl %r14d, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1036,16 +1039,17 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1054,14 +1058,15 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1070,16 +1075,17 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl %r13w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1088,14 +1094,15 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1104,16 +1111,17 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1122,14 +1130,15 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx +; CHECK-NEXT: cmovaq %rbx, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1138,22 +1147,18 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r15, %rdx +; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r12, %rdx -; CHECK-NEXT: movq %rdx, %xmm2 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: movq %rdx, %xmm3 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; CHECK-NEXT: addq $104, %rsp +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> %f) ret <8 x i64> %x @@ -1169,91 +1174,92 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %ecx, %r14d -; CHECK-NEXT: movl %edx, %r12d +; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl %si, %edi ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: cmovbq %r13, %rdx -; CHECK-NEXT: cmovbq %r13, %rax +; CHECK-NEXT: cmovbq %r12, %rdx +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: movq $-1, %rbp -; CHECK-NEXT: cmovaq %rbp, %rax +; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r13, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl %r12w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r13, %rdx -; CHECK-NEXT: cmovbq %r13, %rax +; CHECK-NEXT: cmovbq %r12, %rdx +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rax +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r13, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r13, %rdx -; CHECK-NEXT: cmovbq %r13, %rax +; CHECK-NEXT: cmovbq %r12, %rdx +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rax +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r13, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r13, %rdx -; CHECK-NEXT: cmovbq %r13, %rax +; CHECK-NEXT: cmovbq %r12, %rdx +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rax +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %rbp, %rdx +; CHECK-NEXT: cmovaq %r13, %rdx ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movq %rdx, %r12 +; CHECK-NEXT: movq %rdx, %rbp ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r13, %r12 -; CHECK-NEXT: cmovbq %r13, %rax +; CHECK-NEXT: cmovbq %r12, %rbp +; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rax +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovaq %rbp, %r12 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: cmovaq %r13, %rbp +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT @@ -1262,28 +1268,30 @@ ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r13, %r15 -; CHECK-NEXT: cmovbq %r13, %r14 +; CHECK-NEXT: cmovbq %r12, %r15 +; CHECK-NEXT: cmovbq %r12, %r14 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %r14 -; CHECK-NEXT: cmovaq %rbp, %r15 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: cmovaq %r13, %r14 +; CHECK-NEXT: cmovaq %r13, %r15 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movq %rax, %r13 -; CHECK-NEXT: movq %rdx, %rbp +; CHECK-NEXT: movq %rax, %r12 +; CHECK-NEXT: movq %rdx, %r13 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $0, %eax -; CHECK-NEXT: cmovbq %rax, %rbp ; CHECK-NEXT: cmovbq %rax, %r13 +; CHECK-NEXT: cmovbq %rax, %r12 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: cmovaq %rax, %r12 ; CHECK-NEXT: cmovaq %rax, %r13 -; CHECK-NEXT: cmovaq %rax, %rbp -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT @@ -1299,11 +1307,11 @@ ; CHECK-NEXT: cmovaq %rcx, %rdx ; CHECK-NEXT: movq %rdx, 120(%rbx) ; CHECK-NEXT: movq %rax, 112(%rbx) -; CHECK-NEXT: movq %rbp, 104(%rbx) -; CHECK-NEXT: movq %r13, 96(%rbx) +; CHECK-NEXT: movq %r13, 104(%rbx) +; CHECK-NEXT: movq %r12, 96(%rbx) ; CHECK-NEXT: movq %r15, 88(%rbx) ; CHECK-NEXT: movq %r14, 80(%rbx) -; CHECK-NEXT: movq %r12, 72(%rbx) +; CHECK-NEXT: movq %rbp, 72(%rbx) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: movq %rax, 64(%rbx) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll --- a/llvm/test/CodeGen/X86/freeze.ll +++ b/llvm/test/CodeGen/X86/freeze.ll @@ -38,14 +38,10 @@ ; X86ASM: # %bb.0: ; X86ASM-NEXT: pushq %rax ; X86ASM-NEXT: .cfi_def_cfa_offset 16 -; X86ASM-NEXT: xorl %edi, %edi -; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT -; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT -; X86ASM-NEXT: movzwl %ax, %edi ; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT ; X86ASM-NEXT: addss %xmm0, %xmm0 ; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT -; X86ASM-NEXT: popq %rcx +; X86ASM-NEXT: popq %rax ; X86ASM-NEXT: .cfi_def_cfa_offset 8 ; X86ASM-NEXT: retq %y1 = freeze half undef diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll --- a/llvm/test/CodeGen/X86/half-constrained.ll +++ b/llvm/test/CodeGen/X86/half-constrained.ll @@ -36,7 +36,7 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -81,7 +81,7 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 ; X64-NOF16C-NEXT: cvtss2sd %xmm0, %xmm0 ; X64-NOF16C-NEXT: popq %rax @@ -112,37 +112,30 @@ ; ; X32-F16C-LABEL: half_to_fp80: ; X32-F16C: ## %bb.0: -; X32-F16C-NEXT: pushl %eax -; X32-F16C-NEXT: .cfi_def_cfa_offset 8 -; X32-F16C-NEXT: movzwl _a, %eax -; X32-F16C-NEXT: vmovd %eax, %xmm0 -; X32-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; X32-F16C-NEXT: vmovss %xmm0, (%esp) -; X32-F16C-NEXT: flds (%esp) -; X32-F16C-NEXT: wait -; X32-F16C-NEXT: popl %eax +; X32-F16C-NEXT: subl $12, %esp +; X32-F16C-NEXT: .cfi_def_cfa_offset 16 +; X32-F16C-NEXT: vpinsrw $0, _a, %xmm0, %xmm0 +; X32-F16C-NEXT: vpextrw $0, %xmm0, (%esp) +; X32-F16C-NEXT: calll ___extendhfxf2 +; X32-F16C-NEXT: addl $12, %esp ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: half_to_fp80: ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi -; X64-NOF16C-NEXT: callq ___extendhfsf2 -; X64-NOF16C-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) -; X64-NOF16C-NEXT: flds {{[0-9]+}}(%rsp) -; X64-NOF16C-NEXT: wait +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 +; X64-NOF16C-NEXT: callq ___extendhfxf2 ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq ; ; X64-F16C-LABEL: half_to_fp80: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: movzwl _a(%rip), %eax -; X64-F16C-NEXT: vmovd %eax, %xmm0 -; X64-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) -; X64-F16C-NEXT: flds -{{[0-9]+}}(%rsp) -; X64-F16C-NEXT: wait +; X64-F16C-NEXT: pushq %rax +; X64-F16C-NEXT: .cfi_def_cfa_offset 16 +; X64-F16C-NEXT: vpinsrw $0, _a(%rip), %xmm0, %xmm0 +; X64-F16C-NEXT: callq ___extendhfxf2 +; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %1 = load half, half* @a, align 2 %2 = tail call x86_fp80 @llvm.experimental.constrained.fpext.f80.f16(half %1, metadata !"fpexcept.strict") #0 @@ -166,7 +159,8 @@ ; X32-F16C: ## %bb.0: ; X32-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X32-F16C-NEXT: vpextrw $0, %xmm0, _a +; X32-F16C-NEXT: vmovd %xmm0, %eax +; X32-F16C-NEXT: movw %ax, _a ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: float_to_half: @@ -174,6 +168,7 @@ ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X64-NOF16C-NEXT: callq ___truncsfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -183,7 +178,8 @@ ; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip) +; X64-F16C-NEXT: vmovd %xmm0, %eax +; X64-F16C-NEXT: movw %ax, _a(%rip) ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 store half %2, half* @a, align 2 @@ -205,13 +201,13 @@ ; ; X32-F16C-LABEL: double_to_half: ; X32-F16C: ## %bb.0: -; X32-F16C-NEXT: subl $12, %esp -; X32-F16C-NEXT: .cfi_def_cfa_offset 16 ; X32-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-F16C-NEXT: vmovsd %xmm0, (%esp) -; X32-F16C-NEXT: calll ___truncdfhf2 +; X32-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X32-F16C-NEXT: vmovd %xmm0, %eax ; X32-F16C-NEXT: movw %ax, _a -; X32-F16C-NEXT: addl $12, %esp ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: double_to_half: @@ -219,17 +215,19 @@ ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X64-NOF16C-NEXT: callq ___truncdfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq ; ; X64-F16C-LABEL: double_to_half: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: pushq %rax -; X64-F16C-NEXT: .cfi_def_cfa_offset 16 -; X64-F16C-NEXT: callq ___truncdfhf2 +; X64-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-F16C-NEXT: vmovd %xmm0, %eax ; X64-F16C-NEXT: movw %ax, _a(%rip) -; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 store half %2, half* @a, align 2 @@ -251,14 +249,18 @@ ; ; X32-F16C-LABEL: fp80_to_half: ; X32-F16C: ## %bb.0: -; X32-F16C-NEXT: subl $28, %esp -; X32-F16C-NEXT: .cfi_def_cfa_offset 32 +; X32-F16C-NEXT: subl $12, %esp +; X32-F16C-NEXT: .cfi_def_cfa_offset 16 ; X32-F16C-NEXT: fldt {{[0-9]+}}(%esp) -; X32-F16C-NEXT: fstpt (%esp) +; X32-F16C-NEXT: fstps {{[0-9]+}}(%esp) ; X32-F16C-NEXT: wait -; X32-F16C-NEXT: calll ___truncxfhf2 +; X32-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X32-F16C-NEXT: vmovd %xmm0, %eax ; X32-F16C-NEXT: movw %ax, _a -; X32-F16C-NEXT: addl $28, %esp +; X32-F16C-NEXT: addl $12, %esp ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: fp80_to_half: @@ -269,20 +271,22 @@ ; X64-NOF16C-NEXT: fstpt (%rsp) ; X64-NOF16C-NEXT: wait ; X64-NOF16C-NEXT: callq ___truncxfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: addq $24, %rsp ; X64-NOF16C-NEXT: retq ; ; X64-F16C-LABEL: fp80_to_half: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: subq $24, %rsp -; X64-F16C-NEXT: .cfi_def_cfa_offset 32 ; X64-F16C-NEXT: fldt {{[0-9]+}}(%rsp) -; X64-F16C-NEXT: fstpt (%rsp) +; X64-F16C-NEXT: fstps -{{[0-9]+}}(%rsp) ; X64-F16C-NEXT: wait -; X64-F16C-NEXT: callq ___truncxfhf2 +; X64-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-F16C-NEXT: vmovd %xmm0, %eax ; X64-F16C-NEXT: movw %ax, _a(%rip) -; X64-F16C-NEXT: addq $24, %rsp ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f80(x86_fp80 %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 store half %2, half* @a, align 2 @@ -323,20 +327,22 @@ ; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X32-F16C-NEXT: vpextrw $0, %xmm0, _c +; X32-F16C-NEXT: vmovd %xmm0, %eax +; X32-F16C-NEXT: movw %ax, _c ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: add: ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 -; X64-NOF16C-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; X64-NOF16C-NEXT: movzwl _b(%rip), %edi +; X64-NOF16C-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill +; X64-NOF16C-NEXT: pinsrw $0, _b(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 ; X64-NOF16C-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload ; X64-NOF16C-NEXT: callq ___truncsfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _c(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -353,7 +359,8 @@ ; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-F16C-NEXT: vpextrw $0, %xmm0, _c(%rip) +; X64-F16C-NEXT: vmovd %xmm0, %eax +; X64-F16C-NEXT: movw %ax, _c(%rip) ; X64-F16C-NEXT: retq %1 = load half, half* @a, align 2 %2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \ -; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C +; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=0 \ ; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \ @@ -9,23 +9,25 @@ ; RUN: | FileCheck %s -check-prefixes=CHECK-I686 define void @test_load_store(half* %in, half* %out) #0 { -; BWON-LABEL: test_load_store: -; BWON: # %bb.0: -; BWON-NEXT: movzwl (%rdi), %eax -; BWON-NEXT: movw %ax, (%rsi) -; BWON-NEXT: retq +; CHECK-LIBCALL-LABEL: test_load_store: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rsi) +; CHECK-LIBCALL-NEXT: retq ; -; BWOFF-LABEL: test_load_store: -; BWOFF: # %bb.0: -; BWOFF-NEXT: movw (%rdi), %ax -; BWOFF-NEXT: movw %ax, (%rsi) -; BWOFF-NEXT: retq +; BWON-F16C-LABEL: test_load_store: +; BWON-F16C: # %bb.0: +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_load_store: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: movw (%ecx), %cx +; CHECK-I686-NEXT: pinsrw $0, (%ecx), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl %val = load half, half* %in @@ -74,7 +76,7 @@ define float @test_extend32(half* %addr) #0 { ; CHECK-LIBCALL-LABEL: test_extend32: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; BWON-F16C-LABEL: test_extend32: @@ -88,8 +90,9 @@ ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl @@ -102,7 +105,7 @@ ; CHECK-LIBCALL-LABEL: test_extend64: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: popq %rax @@ -120,8 +123,9 @@ ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl @@ -136,6 +140,7 @@ ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq @@ -143,7 +148,8 @@ ; BWON-F16C-LABEL: test_trunc32: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rdi) +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rdi) ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc32: @@ -151,9 +157,10 @@ ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movd %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) ; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi @@ -164,23 +171,33 @@ } define void @test_trunc64(double %in, half* %addr) #0 { -; CHECK-LABEL: test_trunc64: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq __truncdfhf2@PLT -; CHECK-NEXT: movw %ax, (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-LIBCALL-LABEL: test_trunc64: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_trunc64: +; BWON-F16C: # %bb.0: +; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rdi) +; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movsd %xmm0, (%esp) +; CHECK-I686-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movq %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) ; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi @@ -194,39 +211,28 @@ ; CHECK-LIBCALL-LABEL: test_fptosi_i64: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: callq __fixhfdi@PLT ; CHECK-LIBCALL-NEXT: popq %rcx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_fptosi_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax +; BWON-F16C-NEXT: pushq %rax +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: callq __fixhfdi@PLT +; BWON-F16C-NEXT: popq %rcx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_fptosi_i64: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $28, %esp -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: orl $3072, %eax # imm = 0xC00 -; CHECK-I686-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-I686-NEXT: addl $28, %esp +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: calll __fixhfdi +; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 %r = fptosi half %a to i64 @@ -238,33 +244,34 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rsi, %rbx -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: callq __floatdihf@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_sitofp_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rsi, %rbx +; BWON-F16C-NEXT: callq __floatdihf@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sitofp_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: calll __floatdihf +; CHECK-I686-NEXT: addl $16, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) -; CHECK-I686-NEXT: addl $24, %esp +; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %r = sitofp i64 %a to half @@ -276,63 +283,28 @@ ; CHECK-LIBCALL-LABEL: test_fptoui_i64: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rcx -; CHECK-LIBCALL-NEXT: movq %rcx, %rdx -; CHECK-LIBCALL-NEXT: sarq $63, %rdx -; CHECK-LIBCALL-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax -; CHECK-LIBCALL-NEXT: andq %rdx, %rax -; CHECK-LIBCALL-NEXT: orq %rcx, %rax +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: callq __fixhfdi@PLT ; CHECK-LIBCALL-NEXT: popq %rcx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_fptoui_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx -; BWON-F16C-NEXT: movq %rcx, %rdx -; BWON-F16C-NEXT: sarq $63, %rdx -; BWON-F16C-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax -; BWON-F16C-NEXT: andq %rdx, %rax -; BWON-F16C-NEXT: orq %rcx, %rax +; BWON-F16C-NEXT: pushq %rax +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: callq __fixhfdi@PLT +; BWON-F16C-NEXT: popq %rcx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_fptoui_i64: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $28, %esp -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-I686-NEXT: jae .LBB9_2 -; CHECK-I686-NEXT: # %bb.1: -; CHECK-I686-NEXT: xorps %xmm1, %xmm1 -; CHECK-I686-NEXT: .LBB9_2: -; CHECK-I686-NEXT: subss %xmm1, %xmm0 -; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: setae %al -; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: orl $3072, %ecx # imm = 0xC00 -; CHECK-I686-NEXT: movw %cx, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movzbl %al, %edx -; CHECK-I686-NEXT: shll $31, %edx -; CHECK-I686-NEXT: xorl {{[0-9]+}}(%esp), %edx +; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: addl $28, %esp +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: calll __fixunshfdi +; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 %r = fptoui half %a to i64 @@ -344,58 +316,34 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rsi, %rbx -; CHECK-LIBCALL-NEXT: testq %rdi, %rdi -; CHECK-LIBCALL-NEXT: js .LBB10_1 -; CHECK-LIBCALL-NEXT: # %bb.2: -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: jmp .LBB10_3 -; CHECK-LIBCALL-NEXT: .LBB10_1: -; CHECK-LIBCALL-NEXT: movq %rdi, %rax -; CHECK-LIBCALL-NEXT: shrq %rax -; CHECK-LIBCALL-NEXT: andl $1, %edi -; CHECK-LIBCALL-NEXT: orq %rax, %rdi -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: .LBB10_3: -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: callq __floatundihf@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_uitofp_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: testq %rdi, %rdi -; BWON-F16C-NEXT: js .LBB10_1 -; BWON-F16C-NEXT: # %bb.2: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: jmp .LBB10_3 -; BWON-F16C-NEXT: .LBB10_1: -; BWON-F16C-NEXT: movq %rdi, %rax -; BWON-F16C-NEXT: shrq %rax -; BWON-F16C-NEXT: andl $1, %edi -; BWON-F16C-NEXT: orq %rax, %rdi -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: .LBB10_3: -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rsi, %rbx +; BWON-F16C-NEXT: callq __floatundihf@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_uitofp_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: shrl $31, %eax -; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) -; CHECK-I686-NEXT: fstps (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: calll __floatundihf +; CHECK-I686-NEXT: addl $16, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) -; CHECK-I686-NEXT: addl $24, %esp +; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %r = uitofp i64 %a to half @@ -406,36 +354,31 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend32_vec4: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: subq $88, %rsp -; CHECK-LIBCALL-NEXT: movl (%rdi), %eax -; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx -; CHECK-LIBCALL-NEXT: movl %eax, (%rsp) -; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; CHECK-LIBCALL-NEXT: subq $72, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 2(%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi +; CHECK-LIBCALL-NEXT: pinsrw $0, 4(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 6(%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-LIBCALL-NEXT: addq $88, %rsp +; CHECK-LIBCALL-NEXT: addq $72, %rsp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_extend32_vec4: @@ -445,32 +388,30 @@ ; ; CHECK-I686-LABEL: test_extend32_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $124, %esp +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $88, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl (%eax), %ecx -; CHECK-I686-NEXT: movl 4(%eax), %eax -; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 -; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, 6(%eax), %xmm0 ; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, 4(%eax), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, 2(%eax), %xmm1 +; CHECK-I686-NEXT: pextrw $0, %xmm1, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -485,7 +426,8 @@ ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-I686-NEXT: addl $124, %esp +; CHECK-I686-NEXT: addl $88, %esp +; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x float> @@ -495,37 +437,34 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend64_vec4: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rbp -; CHECK-LIBCALL-NEXT: pushq %r14 -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $32, %rsp -; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %r14d -; CHECK-LIBCALL-NEXT: movzwl 6(%rdi), %ebp -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %ebx -; CHECK-LIBCALL-NEXT: movzwl 2(%rdi), %edi +; CHECK-LIBCALL-NEXT: subq $72, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, 4(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 6(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 2(%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebx, %edi +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebp, %edi +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %r14d, %edi +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: addq $32, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx -; CHECK-LIBCALL-NEXT: popq %r14 -; CHECK-LIBCALL-NEXT: popq %rbp +; CHECK-LIBCALL-NEXT: addq $72, %rsp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_extend64_vec4: @@ -536,24 +475,30 @@ ; ; CHECK-I686-LABEL: test_extend64_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $64, %esp +; CHECK-I686-NEXT: subl $104, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl 6(%eax), %esi -; CHECK-I686-NEXT: movzwl (%eax), %edi -; CHECK-I686-NEXT: movzwl 2(%eax), %ebx -; CHECK-I686-NEXT: movzwl 4(%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, 6(%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, 2(%eax), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, 4(%eax), %xmm1 +; CHECK-I686-NEXT: pextrw $0, %xmm1, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movl %ebx, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movl %edi, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movl %esi, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) @@ -565,10 +510,8 @@ ; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; CHECK-I686-NEXT: addl $64, %esp +; CHECK-I686-NEXT: addl $104, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x double> @@ -576,71 +519,39 @@ } define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 { -; BWON-NOF16C-LABEL: test_trunc32_vec4: -; BWON-NOF16C: # %bb.0: -; BWON-NOF16C-NEXT: pushq %rbp -; BWON-NOF16C-NEXT: pushq %r15 -; BWON-NOF16C-NEXT: pushq %r14 -; BWON-NOF16C-NEXT: pushq %rbx -; BWON-NOF16C-NEXT: subq $24, %rsp -; BWON-NOF16C-NEXT: movq %rdi, %rbx -; BWON-NOF16C-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %r14d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %r15d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %ebp -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movw %ax, (%rbx) -; BWON-NOF16C-NEXT: movw %bp, 6(%rbx) -; BWON-NOF16C-NEXT: movw %r15w, 4(%rbx) -; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) -; BWON-NOF16C-NEXT: addq $24, %rsp -; BWON-NOF16C-NEXT: popq %rbx -; BWON-NOF16C-NEXT: popq %r14 -; BWON-NOF16C-NEXT: popq %r15 -; BWON-NOF16C-NEXT: popq %rbp -; BWON-NOF16C-NEXT: retq -; -; BWOFF-LABEL: test_trunc32_vec4: -; BWOFF: # %bb.0: -; BWOFF-NEXT: pushq %rbp -; BWOFF-NEXT: pushq %r15 -; BWOFF-NEXT: pushq %r14 -; BWOFF-NEXT: pushq %rbx -; BWOFF-NEXT: subq $24, %rsp -; BWOFF-NEXT: movq %rdi, %rbx -; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %r14w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %r15w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %bp -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, (%rbx) -; BWOFF-NEXT: movw %bp, 6(%rbx) -; BWOFF-NEXT: movw %r15w, 4(%rbx) -; BWOFF-NEXT: movw %r14w, 2(%rbx) -; BWOFF-NEXT: addq $24, %rsp -; BWOFF-NEXT: popq %rbx -; BWOFF-NEXT: popq %r14 -; BWOFF-NEXT: popq %r15 -; BWOFF-NEXT: popq %rbp -; BWOFF-NEXT: retq +; CHECK-LIBCALL-LABEL: test_trunc32_vec4: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $64, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 2(%rbx) +; CHECK-LIBCALL-NEXT: addq $64, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_trunc32_vec4: ; BWON-F16C: # %bb.0: @@ -649,40 +560,41 @@ ; ; CHECK-I686-LABEL: test_trunc32_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebp -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $44, %esp +; CHECK-I686-NEXT: subl $88, %esp ; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-I686-NEXT: movaps %xmm0, %xmm1 ; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm1, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %bx -; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movd %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, (%ebp) -; CHECK-I686-NEXT: movw %bx, 6(%ebp) -; CHECK-I686-NEXT: movw %di, 4(%ebp) -; CHECK-I686-NEXT: movw %si, 2(%ebp) -; CHECK-I686-NEXT: addl $44, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 4(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 2(%esi) +; CHECK-I686-NEXT: addl $88, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx -; CHECK-I686-NEXT: popl %ebp ; CHECK-I686-NEXT: retl %v = fptrunc <4 x float> %a to <4 x half> store <4 x half> %v, <4 x half>* %p @@ -690,143 +602,98 @@ } define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 { -; BWON-NOF16C-LABEL: test_trunc64_vec4: -; BWON-NOF16C: # %bb.0: -; BWON-NOF16C-NEXT: pushq %rbp -; BWON-NOF16C-NEXT: pushq %r15 -; BWON-NOF16C-NEXT: pushq %r14 -; BWON-NOF16C-NEXT: pushq %rbx -; BWON-NOF16C-NEXT: subq $40, %rsp -; BWON-NOF16C-NEXT: movq %rdi, %rbx -; BWON-NOF16C-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; BWON-NOF16C-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %r14d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %r15d -; BWON-NOF16C-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %ebp -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movw %ax, 4(%rbx) -; BWON-NOF16C-NEXT: movw %bp, (%rbx) -; BWON-NOF16C-NEXT: movw %r15w, 6(%rbx) -; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) -; BWON-NOF16C-NEXT: addq $40, %rsp -; BWON-NOF16C-NEXT: popq %rbx -; BWON-NOF16C-NEXT: popq %r14 -; BWON-NOF16C-NEXT: popq %r15 -; BWON-NOF16C-NEXT: popq %rbp -; BWON-NOF16C-NEXT: retq -; -; BWOFF-LABEL: test_trunc64_vec4: -; BWOFF: # %bb.0: -; BWOFF-NEXT: pushq %rbp -; BWOFF-NEXT: pushq %r15 -; BWOFF-NEXT: pushq %r14 -; BWOFF-NEXT: pushq %rbx -; BWOFF-NEXT: subq $40, %rsp -; BWOFF-NEXT: movq %rdi, %rbx -; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; BWOFF-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %r14w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %r15w -; BWOFF-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %bp -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, 4(%rbx) -; BWOFF-NEXT: movw %bp, (%rbx) -; BWOFF-NEXT: movw %r15w, 6(%rbx) -; BWOFF-NEXT: movw %r14w, 2(%rbx) -; BWOFF-NEXT: addq $40, %rsp -; BWOFF-NEXT: popq %rbx -; BWOFF-NEXT: popq %r14 -; BWOFF-NEXT: popq %r15 -; BWOFF-NEXT: popq %rbp -; BWOFF-NEXT: retq +; CHECK-LIBCALL-LABEL: test_trunc64_vec4: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $64, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 2(%rbx) +; CHECK-LIBCALL-NEXT: addq $64, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_trunc64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: pushq %rbp -; BWON-F16C-NEXT: pushq %r15 -; BWON-F16C-NEXT: pushq %r14 -; BWON-F16C-NEXT: pushq %rbx -; BWON-F16C-NEXT: subq $56, %rsp -; BWON-F16C-NEXT: movq %rdi, %rbx -; BWON-F16C-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; BWON-F16C-NEXT: vzeroupper -; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %r14d -; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 -; BWON-F16C-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; BWON-F16C-NEXT: vzeroupper -; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %r15d -; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; BWON-F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; BWON-F16C-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: vmovd %xmm1, %eax +; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm1 +; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; BWON-F16C-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; BWON-F16C-NEXT: vmovd %xmm2, %ecx +; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %edx +; BWON-F16C-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %esi +; BWON-F16C-NEXT: movw %si, 4(%rdi) +; BWON-F16C-NEXT: movw %dx, (%rdi) +; BWON-F16C-NEXT: movw %cx, 6(%rdi) +; BWON-F16C-NEXT: movw %ax, 2(%rdi) ; BWON-F16C-NEXT: vzeroupper -; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %ebp -; BWON-F16C-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movw %ax, 4(%rbx) -; BWON-F16C-NEXT: movw %bp, (%rbx) -; BWON-F16C-NEXT: movw %r15w, 6(%rbx) -; BWON-F16C-NEXT: movw %r14w, 2(%rbx) -; BWON-F16C-NEXT: addq $56, %rsp -; BWON-F16C-NEXT: popq %rbx -; BWON-F16C-NEXT: popq %r14 -; BWON-F16C-NEXT: popq %r15 -; BWON-F16C-NEXT: popq %rbp ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebp -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $60, %esp +; CHECK-I686-NEXT: subl $88, %esp ; CHECK-I686-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %bx +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, 6(%ebp) -; CHECK-I686-NEXT: movw %bx, 4(%ebp) -; CHECK-I686-NEXT: movw %di, 2(%ebp) -; CHECK-I686-NEXT: movw %si, (%ebp) -; CHECK-I686-NEXT: addl $60, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 4(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 2(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: addl $88, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx -; CHECK-I686-NEXT: popl %ebp ; CHECK-I686-NEXT: retl %v = fptrunc <4 x double> %a to <4 x half> store <4 x half> %v, <4 x half>* %p @@ -844,7 +711,7 @@ ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: callq test_floatret@PLT ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: popq %rax ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_f80trunc_nodagcombine: @@ -853,8 +720,8 @@ ; BWON-F16C-NEXT: callq test_floatret@PLT ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax -; BWON-F16C-NEXT: popq %rcx +; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; BWON-F16C-NEXT: popq %rax ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_f80trunc_nodagcombine: @@ -876,51 +743,73 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 { ; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $16, %rsp -; CHECK-LIBCALL-NEXT: movzwl (%rsi), %ebx -; CHECK-LIBCALL-NEXT: cvtsi2ss %edi, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi +; CHECK-LIBCALL-NEXT: subq $40, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rsi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: addl $-2147483648, %edi # imm = 0x80000000 +; CHECK-LIBCALL-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; CHECK-LIBCALL-NEXT: movl $1127219200, {{[0-9]+}}(%rsp) # imm = 0x43300000 +; CHECK-LIBCALL-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-LIBCALL-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebx, %edi +; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi -; CHECK-LIBCALL-NEXT: addq $16, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: addq $40, %rsp ; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: movzwl (%rsi), %eax -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; BWON-F16C-NEXT: addl $-2147483648, %edi # imm = 0x80000000 +; BWON-F16C-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; BWON-F16C-NEXT: movl $1127219200, -{{[0-9]+}}(%rsp) # imm = 0x43300000 +; BWON-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; BWON-F16C-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %ecx +; BWON-F16C-NEXT: movzwl %cx, %ecx +; BWON-F16C-NEXT: vmovd %ecx, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movzwl %ax, %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sitofp_fadd_i32: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %edi -; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $20, %esp +; CHECK-I686-NEXT: subl $76, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %edi -; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 -; CHECK-I686-NEXT: movss %xmm0, (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %si -; CHECK-I686-NEXT: movl %edi, (%esp) +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; CHECK-I686-NEXT: xorl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movl $1127219200, {{[0-9]+}}(%esp) # imm = 0x43300000 +; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; CHECK-I686-NEXT: movsd %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movzwl %si, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -928,12 +817,10 @@ ; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movzwl %ax, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: addl $20, %esp -; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi +; CHECK-I686-NEXT: addl $76, %esp ; CHECK-I686-NEXT: retl %tmp0 = load half, half* %b %tmp1 = sitofp i32 %a to half @@ -946,47 +833,60 @@ ; CHECK-LIBCALL-LABEL: PR40273: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl %di, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: xorl %eax, %eax ; CHECK-LIBCALL-NEXT: xorps %xmm1, %xmm1 ; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movl $15360, %ecx # imm = 0x3C00 -; CHECK-LIBCALL-NEXT: cmovnel %ecx, %eax -; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax -; CHECK-LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: jne .LBB17_3 +; CHECK-LIBCALL-NEXT: # %bb.1: +; CHECK-LIBCALL-NEXT: jp .LBB17_3 +; CHECK-LIBCALL-NEXT: # %bb.2: +; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: .LBB17_3: +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: popq %rax ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: PR40273: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl %di, %eax +; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax +; BWON-F16C-NEXT: movzwl %ax, %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 -; BWON-F16C-NEXT: movl $15360, %ecx # imm = 0x3C00 -; BWON-F16C-NEXT: cmovnel %ecx, %eax -; BWON-F16C-NEXT: cmovpl %ecx, %eax -; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax +; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; BWON-F16C-NEXT: jne .LBB17_3 +; BWON-F16C-NEXT: # %bb.1: +; BWON-F16C-NEXT: jp .LBB17_3 +; BWON-F16C-NEXT: # %bb.2: +; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: .LBB17_3: +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: PR40273: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp -; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: xorl %eax, %eax ; CHECK-I686-NEXT: xorps %xmm1, %xmm1 ; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-I686-NEXT: movl $15360, %ecx # imm = 0x3C00 -; CHECK-I686-NEXT: cmovnel %ecx, %eax -; CHECK-I686-NEXT: cmovpl %ecx, %eax -; CHECK-I686-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: jne .LBB17_3 +; CHECK-I686-NEXT: # %bb.1: +; CHECK-I686-NEXT: jp .LBB17_3 +; CHECK-I686-NEXT: # %bb.2: +; CHECK-I686-NEXT: xorps %xmm0, %xmm0 +; CHECK-I686-NEXT: .LBB17_3: +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %2 = fcmp une half %0, 0xH0000 diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -7,13 +7,16 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v1f16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: subl $28, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: movl %esi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstps {{[0-9]+}}(%esp) @@ -21,54 +24,60 @@ ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: addl $12, %esp -; X86-NEXT: popl %esi +; X86-NEXT: addl $28, %esp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v1f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: subq $16, %rsp -; X64-NEXT: movl %edi, %ebx -; X64-NEXT: movzwl %si, %edi -; X64-NEXT: callq __gnu_h2f_ieee@PLT +; X64-NEXT: pushq %rax ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %bx, %edi +; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: callq __gnu_h2f_ieee@PLT +; X64-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: addq $16, %rsp -; X64-NEXT: popq %rbx +; X64-NEXT: popq %rax ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v1f16: ; F16C: # %bb.0: -; F16C-NEXT: movzwl %si, %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vpextrw $0, %xmm1, %ecx +; F16C-NEXT: movzwl %cx, %ecx +; F16C-NEXT: vmovd %ecx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %di, %eax +; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; F16C-NEXT: retq ; ; F16C-O0-LABEL: ir_fadd_v1f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: movw %si, %cx -; F16C-O0-NEXT: movw %di, %ax -; F16C-O0-NEXT: movzwl %cx, %ecx -; F16C-O0-NEXT: vmovd %ecx, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1 +; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm1 +; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax ; F16C-O0-NEXT: movzwl %ax, %eax ; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-O0-NEXT: vmovd %xmm0, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm0 +; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; F16C-O0-NEXT: retq %retval = fadd <1 x half> %arg0, %arg1 ret <1 x half> %retval @@ -77,26 +86,30 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v2f16: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzwl 8(%ebp), %esi -; X86-NEXT: movzwl 12(%ebp), %edi -; X86-NEXT: movzwl 20(%ebp), %ebx -; X86-NEXT: movzwl 16(%ebp), %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: subl $80, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; X86-NEXT: movl %ebx, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; X86-NEXT: movl %edi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: movl %esi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) @@ -108,117 +121,113 @@ ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) -; X86-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: pextrw $1, %xmm0, %edx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: # kill: def $dx killed $dx killed $edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movaps %xmm0, %xmm1 +; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; X86-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-NEXT: addl $80, %esp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v2f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rbp -; X64-NEXT: pushq %r14 -; X64-NEXT: pushq %rbx -; X64-NEXT: subq $32, %rsp -; X64-NEXT: movl %edx, %ebp -; X64-NEXT: movl %esi, %ebx -; X64-NEXT: movl %edi, %r14d -; X64-NEXT: movzwl %cx, %edi +; X64-NEXT: subq $24, %rsp +; X64-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %bx, %edi +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl %bp, %edi +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %r14w, %edi +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; X64-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: pextrw $1, %xmm0, %edx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: # kill: def $dx killed $dx killed $edx -; X64-NEXT: addq $32, %rsp -; X64-NEXT: popq %rbx -; X64-NEXT: popq %r14 -; X64-NEXT: popq %rbp +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: addq $24, %rsp ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v2f16: ; F16C: # %bb.0: -; F16C-NEXT: movzwl %cx, %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpextrw $0, %xmm1, %eax +; F16C-NEXT: vpextrw $0, %xmm3, %ecx +; F16C-NEXT: vpextrw $0, %xmm0, %edx +; F16C-NEXT: vpextrw $0, %xmm2, %esi +; F16C-NEXT: movzwl %si, %esi +; F16C-NEXT: vmovd %esi, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %si, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: movzwl %dx, %edx +; F16C-NEXT: vmovd %edx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-NEXT: movzwl %dx, %eax -; F16C-NEXT: vmovd %eax, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %di, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vmovd %xmm0, %edx +; F16C-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 +; F16C-NEXT: movzwl %cx, %ecx +; F16C-NEXT: vmovd %ecx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: vpextrw $1, %xmm0, %edx -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: # kill: def $dx killed $dx killed $edx +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm2 +; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-NEXT: vmovd %xmm1, %eax +; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 ; F16C-NEXT: retq ; ; F16C-O0-LABEL: ir_fadd_v2f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: movl %esi, %eax -; F16C-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; F16C-O0-NEXT: movw %dx, %si +; F16C-O0-NEXT: vpextrw $0, %xmm2, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm2 +; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax ; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movw %di, %dx -; F16C-O0-NEXT: movzwl %si, %esi -; F16C-O0-NEXT: vmovd %esi, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1 -; F16C-O0-NEXT: movzwl %dx, %edx -; F16C-O0-NEXT: vmovd %edx, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-O0-NEXT: movzwl %cx, %ecx -; F16C-O0-NEXT: vmovd %ecx, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1 ; F16C-O0-NEXT: movzwl %ax, %eax ; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; F16C-O0-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-O0-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; F16C-O0-NEXT: vmovd %xmm0, %eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm0 +; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; F16C-O0-NEXT: vpextrw $0, %xmm3, %eax ; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: vpextrw $1, %xmm0, %ecx -; F16C-O0-NEXT: movw %cx, %dx +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm2 +; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm1 +; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-O0-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; F16C-O0-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-O0-NEXT: vmovd %xmm1, %eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm1 +; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm1, %xmm1 ; F16C-O0-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/llvm/test/CodeGen/X86/pr38533.ll b/llvm/test/CodeGen/X86/pr38533.ll --- a/llvm/test/CodeGen/X86/pr38533.ll +++ b/llvm/test/CodeGen/X86/pr38533.ll @@ -1,23 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512fp16 | FileCheck %s --check-prefix=AVX512FP16 ; This test makes sure that a vector that needs to be promoted that is bitcasted to fp16 is legalized correctly without causing a width mismatch. define void @constant_fold_vector_to_half() { -; CHECK-LABEL: constant_fold_vector_to_half: -; CHECK: # %bb.0: -; CHECK-NEXT: movw $16384, (%rax) # imm = 0x4000 -; CHECK-NEXT: retq +; SSE2-LABEL: constant_fold_vector_to_half: +; SSE2: # %bb.0: +; SSE2-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; SSE2-NEXT: pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rax) +; SSE2-NEXT: retq +; +; AVX512-LABEL: constant_fold_vector_to_half: +; AVX512: # %bb.0: +; AVX512-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; AVX512-NEXT: vpinsrw $0, -{{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX512-NEXT: retq +; +; AVX512FP16-LABEL: constant_fold_vector_to_half: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; AVX512FP16-NEXT: vmovsh -{{[0-9]+}}(%rsp), %xmm0 +; AVX512FP16-NEXT: vmovsh %xmm0, (%rax) +; AVX512FP16-NEXT: retq store volatile half bitcast (<4 x i4> to half), half* undef ret void } ; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing. define void @pr38533_2(half %x) { -; CHECK-LABEL: pr38533_2: -; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rax) -; CHECK-NEXT: retq +; SSE2-LABEL: pr38533_2: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rax) +; SSE2-NEXT: retq +; +; AVX512-LABEL: pr38533_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX512-NEXT: retq +; +; AVX512FP16-LABEL: pr38533_2: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovsh %xmm0, (%rax) +; AVX512FP16-NEXT: retq %a = bitcast half %x to <4 x i4> store volatile <4 x i4> %a, <4 x i4>* undef ret void @@ -25,10 +54,21 @@ ; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized. define void @pr38533_3(half %x) { -; CHECK-LABEL: pr38533_3: -; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rax) -; CHECK-NEXT: retq +; SSE2-LABEL: pr38533_3: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rax) +; SSE2-NEXT: retq +; +; AVX512-LABEL: pr38533_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX512-NEXT: retq +; +; AVX512FP16-LABEL: pr38533_3: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovsh %xmm0, (%rax) +; AVX512FP16-NEXT: retq %a = bitcast half %x to <16 x i1> store volatile <16 x i1> %a, <16 x i1>* undef ret void diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -7,54 +7,85 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-LABEL: doTheTestMod: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $124, %esp +; CHECK-NEXT: # implicit-def: $xmm3 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm3 +; CHECK-NEXT: # implicit-def: $xmm2 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm2 +; CHECK-NEXT: # implicit-def: $xmm1 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: # implicit-def: $xmm4 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm4 +; CHECK-NEXT: # implicit-def: $xmm5 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm5 +; CHECK-NEXT: # implicit-def: $xmm6 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm6 +; CHECK-NEXT: # implicit-def: $xmm7 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm7 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %si -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %di -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bp -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax +; CHECK-NEXT: pextrw $0, %xmm7, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %di, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %si, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm6, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: pextrw $0, %xmm5, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm4, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm3, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm2, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm1, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: # implicit-def: $xmm1 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm1 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm1 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: # implicit-def: $xmm1 +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-NEXT: movl %esp, %eax @@ -65,15 +96,22 @@ ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-NEXT: movl %esp, %eax @@ -84,15 +122,22 @@ ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %si -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-NEXT: movl %esp, %eax @@ -103,15 +148,22 @@ ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %di -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, %cx ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movw %cx, (%eax) ; CHECK-NEXT: calll __gnu_h2f_ieee ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-NEXT: movl %esp, %eax @@ -122,19 +174,28 @@ ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Reload +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %bx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: movw %bx, 6(%ecx) -; CHECK-NEXT: movw %di, 4(%ecx) -; CHECK-NEXT: movw %si, 2(%ecx) +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: pextrw $0, %xmm3, %edx +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-NEXT: movw %dx, 6(%ecx) +; CHECK-NEXT: pextrw $0, %xmm2, %edx +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-NEXT: movw %dx, 4(%ecx) +; CHECK-NEXT: pextrw $0, %xmm1, %edx +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-NEXT: movw %dx, 2(%ecx) +; CHECK-NEXT: pextrw $0, %xmm0, %edx +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-NEXT: movw %dx, (%ecx) ; CHECK-NEXT: addl $124, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: popl %ebx -; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl $4 Entry: %x = alloca <4 x half>, align 8 diff --git a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir --- a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir +++ b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir @@ -128,7 +128,7 @@ ; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !4) ; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !4) ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2293769 /* reguse:GR32 */, [[MOV32rm]], 2293769 /* reguse:GR32 */, [[MOV32r0_]], 2293769 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2359305 /* reguse:GR32 */, [[MOV32rm]], 2359305 /* reguse:GR32 */, [[MOV32r0_]], 2359305 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8 ; CHECK-NEXT: MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !4) ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !9) ; CHECK-NEXT: OR8mi [[MOV32rm2]], 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4) @@ -143,7 +143,7 @@ %4:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !5) %6:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !5) %5:gr32 = MOV32r0 implicit-def dead $eflags - INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2293769 /* reguse:GR32 */, %4, 2293769 /* reguse:GR32 */, %5, 2293769 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9 + INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2359305 /* reguse:GR32 */, %4, 2359305 /* reguse:GR32 */, %5, 2359305 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9 MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !5) %7:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !10) OR8mi %7, 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4) diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -4,28 +4,30 @@ define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl 6(%rdi), %r8d -; CHECK-NEXT: movzwl 4(%rdi), %r11d -; CHECK-NEXT: movq (%rsi), %rsi -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: pextrw $1, %xmm0, %r9d -; CHECK-NEXT: movd %xmm0, %r10d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm0, %edi -; CHECK-NEXT: movw %r11w, 8(%rdx) -; CHECK-NEXT: movw %cx, 4(%rdx) -; CHECK-NEXT: movw %r8w, 12(%rdx) -; CHECK-NEXT: movw %si, (%rdx) -; CHECK-NEXT: movw %di, 10(%rdx) +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1 +; CHECK-NEXT: pinsrw $0, 4(%rdi), %xmm2 +; CHECK-NEXT: pinsrw $0, 6(%rdi), %xmm3 +; CHECK-NEXT: pinsrw $0, (%rsi), %xmm4 +; CHECK-NEXT: pinsrw $0, 2(%rsi), %xmm5 +; CHECK-NEXT: pinsrw $0, 4(%rsi), %xmm6 +; CHECK-NEXT: pinsrw $0, 6(%rsi), %xmm7 +; CHECK-NEXT: pextrw $0, %xmm7, %eax ; CHECK-NEXT: movw %ax, 14(%rdx) -; CHECK-NEXT: movw %r10w, 2(%rdx) -; CHECK-NEXT: movw %r9w, 6(%rdx) +; CHECK-NEXT: pextrw $0, %xmm3, %eax +; CHECK-NEXT: movw %ax, 12(%rdx) +; CHECK-NEXT: pextrw $0, %xmm6, %eax +; CHECK-NEXT: movw %ax, 10(%rdx) +; CHECK-NEXT: pextrw $0, %xmm2, %eax +; CHECK-NEXT: movw %ax, 8(%rdx) +; CHECK-NEXT: pextrw $0, %xmm5, %eax +; CHECK-NEXT: movw %ax, 6(%rdx) +; CHECK-NEXT: pextrw $0, %xmm1, %eax +; CHECK-NEXT: movw %ax, 4(%rdx) +; CHECK-NEXT: pextrw $0, %xmm4, %eax +; CHECK-NEXT: movw %ax, 2(%rdx) +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, (%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll @@ -803,7 +803,7 @@ define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd123sh: - ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2) ret half %2 @@ -812,7 +812,7 @@ define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd213sh: - ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2) ret half %2 @@ -820,7 +820,7 @@ define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd231sh: - ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0) ret half %2 @@ -828,7 +828,7 @@ define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd321sh: - ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0) ret half %2 @@ -836,7 +836,7 @@ define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd132sh: - ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1) ret half %2 @@ -844,7 +844,7 @@ define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmadd312sh: - ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1) ret half %2 @@ -852,7 +852,7 @@ define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub123sh: - ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2) @@ -861,7 +861,7 @@ define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub213sh: - ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2) @@ -870,7 +870,7 @@ define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub231sh: - ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2) @@ -879,7 +879,7 @@ define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub321sh: - ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2) @@ -888,7 +888,7 @@ define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub132sh: - ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2) @@ -897,7 +897,7 @@ define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fmsub312sh: - ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2) @@ -906,7 +906,7 @@ define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd123sh: - ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2) @@ -915,7 +915,7 @@ define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd213sh: - ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2) @@ -924,7 +924,7 @@ define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd231sh: - ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0) @@ -933,7 +933,7 @@ define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd321sh: - ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0) @@ -942,7 +942,7 @@ define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd132sh: - ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1) @@ -951,7 +951,7 @@ define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmadd312sh: - ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1) @@ -960,7 +960,7 @@ define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub123sh: - ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = fneg half %a2 @@ -970,7 +970,7 @@ define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub213sh: - ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = fneg half %a2 @@ -980,7 +980,7 @@ define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub231sh: - ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a1 %3 = fneg half %a0 @@ -990,7 +990,7 @@ define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub321sh: - ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = fneg half %a0 @@ -1000,7 +1000,7 @@ define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub132sh: - ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a0 %3 = fneg half %a1 @@ -1010,7 +1010,7 @@ define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) { ;CHECK-LABEL: stack_fold_fnmsub312sh: - ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fneg half %a2 %3 = fneg half %a1 diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll @@ -50,7 +50,7 @@ define half @stack_fold_addsh(half %a0, half %a1) { ;CHECK-LABEL: stack_fold_addsh - ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fadd half %a0, %a1 ret half %2 @@ -107,7 +107,7 @@ define half @stack_fold_divsh(half %a0, half %a1) { ;CHECK-LABEL: stack_fold_divsh - ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fdiv half %a0, %a1 ret half %2 @@ -390,7 +390,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 { ;CHECK-LABEL: stack_fold_maxsh: - ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp ogt half %a0, %a1 %3 = select i1 %2, half %a0, half %a1 @@ -399,7 +399,7 @@ define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 { ;CHECK-LABEL: stack_fold_maxsh_commuted: - ;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp ogt half %a1, %a0 %3 = select i1 %2, half %a1, half %a0 @@ -408,7 +408,7 @@ define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 { ;CHECK-LABEL: stack_fold_maxsh_commutable: - ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp ogt half %a0, %a1 %3 = select i1 %2, half %a0, half %a1 @@ -417,7 +417,7 @@ define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 { ;CHECK-LABEL: stack_fold_maxsh_commutable_commuted: - ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp ogt half %a1, %a0 %3 = select i1 %2, half %a1, half %a0 @@ -569,7 +569,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 { ;CHECK-LABEL: stack_fold_minsh: - ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp olt half %a0, %a1 %3 = select i1 %2, half %a0, half %a1 @@ -578,7 +578,7 @@ define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 { ;CHECK-LABEL: stack_fold_minsh_commuted: - ;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp olt half %a1, %a0 %3 = select i1 %2, half %a1, half %a0 @@ -587,7 +587,7 @@ define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 { ;CHECK-LABEL: stack_fold_minsh_commutable: - ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp olt half %a0, %a1 %3 = select i1 %2, half %a0, half %a1 @@ -596,7 +596,7 @@ define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 { ;CHECK-LABEL: stack_fold_minsh_commutable_commuted: - ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fcmp olt half %a1, %a0 %3 = select i1 %2, half %a1, half %a0 @@ -671,7 +671,7 @@ define half @stack_fold_mulsh(half %a0, half %a1) { ;CHECK-LABEL: stack_fold_mulsh - ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fmul half %a0, %a1 ret half %2 @@ -972,7 +972,7 @@ define half @stack_fold_subsh(half %a0, half %a1) { ;CHECK-LABEL: stack_fold_subsh - ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = fsub half %a0, %a1 ret half %2 diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir @@ -340,7 +340,7 @@ ; CHECK: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags ; CHECK: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 ; CHECK: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags - ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %102, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags ; CHECK: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK: $rdi = COPY [[COPY4]] @@ -456,7 +456,7 @@ %63:gr64 = NOT64r %63 CMP64rr %63, %31, implicit-def $eflags %63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags - INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %53, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %53, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $rdi = COPY %64 diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2101,61 +2101,39 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %rbx -; SSE-NEXT: pushq %rax -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movzwl %di, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: cvttss2si %xmm0, %ebp -; SSE-NEXT: movzwl %bx, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: callq __fixhfsi@PLT +; SSE-NEXT: movl %eax, %ebx +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __fixhfsi@PLT ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: movd %ebx, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: addq $16, %rsp ; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f16_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: pushq %rbp -; VEX-NEXT: pushq %rbx -; VEX-NEXT: pushq %rax -; VEX-NEXT: movl %esi, %ebx -; VEX-NEXT: movzwl %di, %edi -; VEX-NEXT: callq __gnu_h2f_ieee@PLT -; VEX-NEXT: vcvttss2si %xmm0, %ebp -; VEX-NEXT: movzwl %bx, %edi -; VEX-NEXT: callq __gnu_h2f_ieee@PLT -; VEX-NEXT: vcvttss2si %xmm0, %eax -; VEX-NEXT: vmovd %eax, %xmm0 -; VEX-NEXT: vmovd %ebp, %xmm1 -; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: addq $8, %rsp -; VEX-NEXT: popq %rbx -; VEX-NEXT: popq %rbp -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f16_to_4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %di, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %eax -; AVX512-NEXT: movzwl %si, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_2f16_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: callq __fixhfsi@PLT +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq __fixhfsi@PLT +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebx, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -557,7 +557,8 @@ ; ALL-LABEL: store_cvt_f32_to_i16: ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vpextrw $0, %xmm0, (%rdi) +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, (%rdi) ; ALL-NEXT: retq %1 = fptrunc float %a0 to half %2 = bitcast half %1 to i16 @@ -647,7 +648,11 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind { ; ALL-LABEL: cvt_f64_to_i16: ; ALL: # %bb.0: -; ALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: retq %1 = fptrunc double %a0 to half %2 = bitcast half %1 to i16 ret i16 %2 @@ -656,16 +661,16 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; ALL-LABEL: cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: subq $40, %rsp -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $40, %rsp +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -675,28 +680,27 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_4i16: ; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -706,28 +710,27 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -738,28 +741,27 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: addq $72, %rsp +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -770,205 +772,165 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX1-LABEL: cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r15d -; AVX1-NEXT: orl %ebx, %r15d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: movzwl %cx, %ecx +; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r14d -; AVX1-NEXT: orl %ebx, %r14d -; AVX1-NEXT: shlq $32, %r14 -; AVX1-NEXT: orq %r15, %r14 -; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r15d -; AVX1-NEXT: orl %ebx, %r15d -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %ebx, %eax +; AVX1-NEXT: orl %edx, %eax ; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %r15, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vmovq %r14, %xmm1 +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: movzwl %dx, %edx +; AVX1-NEXT: orl %ecx, %edx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: movzwl %si, %esi +; AVX1-NEXT: orl %ecx, %esi +; AVX1-NEXT: shlq $32, %rsi +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: addq $64, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $64, %rsp -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r15d -; AVX2-NEXT: orl %ebx, %r15d -; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: shll $16, %eax +; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: movzwl %cx, %ecx +; AVX2-NEXT: orl %eax, %ecx ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r14d -; AVX2-NEXT: orl %ebx, %r14d -; AVX2-NEXT: shlq $32, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r15d -; AVX2-NEXT: orl %ebx, %r15d -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %edx +; AVX2-NEXT: shll $16, %edx +; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %ebx, %eax +; AVX2-NEXT: orl %edx, %eax ; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vmovq %r14, %xmm1 +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: shll $16, %ecx +; AVX2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: movzwl %dx, %edx +; AVX2-NEXT: orl %ecx, %edx +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: shll $16, %ecx +; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: movzwl %si, %esi +; AVX2-NEXT: orl %ecx, %esi +; AVX2-NEXT: shlq $32, %rsi +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: addq $64, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $80, %rsp -; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r15d -; AVX512-NEXT: orl %ebx, %r15d -; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r14d -; AVX512-NEXT: orl %ebx, %r14d -; AVX512-NEXT: shlq $32, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r15d -; AVX512-NEXT: orl %ebx, %r15d -; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: shll $16, %eax +; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: movzwl %cx, %ecx +; AVX512-NEXT: orl %eax, %ecx +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %edx +; AVX512-NEXT: shll $16, %edx +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %eax ; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: orl %ebx, %eax +; AVX512-NEXT: orl %edx, %eax ; AVX512-NEXT: shlq $32, %rax -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %r14, %xmm1 +; AVX512-NEXT: orq %rcx, %rax +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: shll $16, %ecx +; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %edx +; AVX512-NEXT: movzwl %dx, %edx +; AVX512-NEXT: orl %ecx, %edx +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: shll $16, %ecx +; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: movzwl %si, %esi +; AVX512-NEXT: orl %ecx, %esi +; AVX512-NEXT: shlq $32, %rsi +; AVX512-NEXT: orq %rdx, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: addq $80, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -982,11 +944,10 @@ define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind { ; ALL-LABEL: store_cvt_f64_to_i16: ; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rbx) -; ALL-NEXT: popq %rbx +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, (%rdi) ; ALL-NEXT: retq %1 = fptrunc double %a0 to half %2 = bitcast half %1 to i16 @@ -997,21 +958,15 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $24, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movl %eax, %ebp -; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rbx) -; ALL-NEXT: movw %bp, 2(%rbx) -; ALL-NEXT: addq $24, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: popq %rbp +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %ecx +; ALL-NEXT: movw %cx, (%rdi) +; ALL-NEXT: movw %ax, 2(%rdi) ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -1020,119 +975,29 @@ } define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f64_to_4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $56, %rsp -; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, 4(%rbx) -; AVX1-NEXT: movw %bp, (%rbx) -; AVX1-NEXT: movw %r15w, 6(%rbx) -; AVX1-NEXT: movw %r14w, 2(%rbx) -; AVX1-NEXT: addq $56, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f64_to_4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $56, %rsp -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, 4(%rbx) -; AVX2-NEXT: movw %bp, (%rbx) -; AVX2-NEXT: movw %r15w, 6(%rbx) -; AVX2-NEXT: movw %r14w, 2(%rbx) -; AVX2-NEXT: addq $56, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: retq -; -; AVX512-LABEL: store_cvt_4f64_to_4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $56, %rsp -; AVX512-NEXT: movq %rdi, %rbx -; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, 4(%rbx) -; AVX512-NEXT: movw %bp, (%rbx) -; AVX512-NEXT: movw %r15w, 6(%rbx) -; AVX512-NEXT: movw %r14w, 2(%rbx) -; AVX512-NEXT: addq $56, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; ALL-LABEL: store_cvt_4f64_to_4i16: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %ecx +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %esi +; ALL-NEXT: movw %si, 4(%rdi) +; ALL-NEXT: movw %dx, (%rdi) +; ALL-NEXT: movw %cx, 6(%rdi) +; ALL-NEXT: movw %ax, 2(%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> store <4 x i16> %2, <4 x i16>* %a1 @@ -1142,32 +1007,28 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vmovaps %xmm0, (%rdi) ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $64, %rsp -; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -1179,32 +1040,28 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $64, %rsp -; ALL-NEXT: popq %rbx +; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -1216,208 +1073,132 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX1-LABEL: store_cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $120, %rsp -; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %r8d +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX1-NEXT: vmovd %xmm3, %r9d +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX1-NEXT: vmovd %xmm3, %r10d +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX1-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX1-NEXT: vmovd %xmm4, %r11d +; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: movw %si, 12(%rdi) +; AVX1-NEXT: movw %dx, 8(%rdi) +; AVX1-NEXT: movw %cx, 4(%rdi) +; AVX1-NEXT: movw %ax, (%rdi) +; AVX1-NEXT: movw %r11w, 14(%rdi) +; AVX1-NEXT: movw %r10w, 10(%rdi) +; AVX1-NEXT: movw %r9w, 6(%rdi) +; AVX1-NEXT: movw %r8w, 2(%rdi) ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, 12(%rbx) -; AVX1-NEXT: movw %r15w, 8(%rbx) -; AVX1-NEXT: movw %r14w, 4(%rbx) -; AVX1-NEXT: movw %bp, (%rbx) -; AVX1-NEXT: movw %r13w, 14(%rbx) -; AVX1-NEXT: movw %r12w, 10(%rbx) -; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX1-NEXT: movw %ax, 6(%rbx) -; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX1-NEXT: movw %ax, 2(%rbx) -; AVX1-NEXT: addq $120, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $120, %rsp -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %r8d +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX2-NEXT: vmovd %xmm3, %r9d +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX2-NEXT: vmovd %xmm3, %r10d +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX2-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX2-NEXT: vmovd %xmm4, %r11d +; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: movw %si, 12(%rdi) +; AVX2-NEXT: movw %dx, 8(%rdi) +; AVX2-NEXT: movw %cx, 4(%rdi) +; AVX2-NEXT: movw %ax, (%rdi) +; AVX2-NEXT: movw %r11w, 14(%rdi) +; AVX2-NEXT: movw %r10w, 10(%rdi) +; AVX2-NEXT: movw %r9w, 6(%rdi) +; AVX2-NEXT: movw %r8w, 2(%rdi) ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r12d -; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, 12(%rbx) -; AVX2-NEXT: movw %r15w, 8(%rbx) -; AVX2-NEXT: movw %r14w, 4(%rbx) -; AVX2-NEXT: movw %bp, (%rbx) -; AVX2-NEXT: movw %r13w, 14(%rbx) -; AVX2-NEXT: movw %r12w, 10(%rbx) -; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX2-NEXT: movw %ax, 6(%rbx) -; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX2-NEXT: movw %ax, 2(%rbx) -; AVX2-NEXT: addq $120, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp -; AVX512-NEXT: movq %rdi, %rbx -; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r12d -; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %r8d +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %r9d +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %r10d +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %r11d +; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %edx +; AVX512-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: movw %si, 12(%rdi) +; AVX512-NEXT: movw %dx, 8(%rdi) +; AVX512-NEXT: movw %cx, 4(%rdi) +; AVX512-NEXT: movw %ax, (%rdi) +; AVX512-NEXT: movw %r11w, 14(%rdi) +; AVX512-NEXT: movw %r10w, 10(%rdi) +; AVX512-NEXT: movw %r9w, 6(%rdi) +; AVX512-NEXT: movw %r8w, 2(%rdi) ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, 12(%rbx) -; AVX512-NEXT: movw %r15w, 8(%rbx) -; AVX512-NEXT: movw %r14w, 4(%rbx) -; AVX512-NEXT: movw %bp, (%rbx) -; AVX512-NEXT: movw %r13w, 14(%rbx) -; AVX512-NEXT: movw %r12w, 10(%rbx) -; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX512-NEXT: movw %ax, 6(%rbx) -; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX512-NEXT: movw %ax, 2(%rbx) -; AVX512-NEXT: addq $152, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -367,70 +367,63 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; SSE-LABEL: test_v2f16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $16, %rsp -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movl %edi, %r14d -; SSE-NEXT: movzwl %bx, %ebp -; SSE-NEXT: movl %ebp, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT +; SSE-NEXT: pushq %rax ; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movzwl %r14w, %edi +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_h2f_ieee@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; SSE-NEXT: cmoval %r14d, %ebx -; SSE-NEXT: movw %bx, (%rsp) -; SSE-NEXT: movl (%rsp), %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: addq $16, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %rbp +; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: ja .LBB10_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: .LBB10_2: +; SSE-NEXT: callq __gnu_f2h_ieee@PLT +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f16: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: movl %esi, %ebx -; AVX-NEXT: movl %edi, %r14d -; AVX-NEXT: movzwl %bx, %ebp -; AVX-NEXT: movl %ebp, %edi -; AVX-NEXT: callq __gnu_h2f_ieee@PLT +; AVX-NEXT: pushq %rax ; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: movzwl %r14w, %edi +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: callq __gnu_h2f_ieee@PLT +; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; AVX-NEXT: cmoval %r14d, %ebx -; AVX-NEXT: movw %bx, (%rsp) -; AVX-NEXT: movl (%rsp), %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: ja .LBB10_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB10_2: +; AVX-NEXT: callq __gnu_f2h_ieee@PLT +; AVX-NEXT: popq %rax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2f16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movzwl %si, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx +; AVX512BW-NEXT: movzwl %cx, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512BW-NEXT: movzwl %di, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: movzwl %ax, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 ; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 -; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: cmoval %edi, %esi -; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: seta %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512FP16-LABEL: test_v2f16: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -366,70 +366,63 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; SSE-LABEL: test_v2f16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $16, %rsp -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movl %edi, %r14d -; SSE-NEXT: movzwl %bx, %ebp -; SSE-NEXT: movl %ebp, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT +; SSE-NEXT: pushq %rax ; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movzwl %r14w, %edi +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_h2f_ieee@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; SSE-NEXT: cmovbl %r14d, %ebx -; SSE-NEXT: movw %bx, (%rsp) -; SSE-NEXT: movl (%rsp), %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: addq $16, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %rbp +; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: jb .LBB10_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: .LBB10_2: +; SSE-NEXT: callq __gnu_f2h_ieee@PLT +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f16: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: movl %esi, %ebx -; AVX-NEXT: movl %edi, %r14d -; AVX-NEXT: movzwl %bx, %ebp -; AVX-NEXT: movl %ebp, %edi -; AVX-NEXT: callq __gnu_h2f_ieee@PLT +; AVX-NEXT: pushq %rax ; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: movzwl %r14w, %edi +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: callq __gnu_h2f_ieee@PLT +; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; AVX-NEXT: cmovbl %r14d, %ebx -; AVX-NEXT: movw %bx, (%rsp) -; AVX-NEXT: movl (%rsp), %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: jb .LBB10_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB10_2: +; AVX-NEXT: callq __gnu_f2h_ieee@PLT +; AVX-NEXT: popq %rax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2f16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movzwl %si, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx +; AVX512BW-NEXT: movzwl %cx, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512BW-NEXT: movzwl %di, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: movzwl %ax, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 ; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 -; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: cmovbl %edi, %esi -; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512FP16-LABEL: test_v2f16: diff --git a/llvm/test/MC/X86/x86_64-asm-match.s b/llvm/test/MC/X86/x86_64-asm-match.s --- a/llvm/test/MC/X86/x86_64-asm-match.s +++ b/llvm/test/MC/X86/x86_64-asm-match.s @@ -5,16 +5,16 @@ // CHECK: Trying to match opcode MMX_PSHUFBrr // CHECK: Matching formal operand class MCK_VR64 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode // CHECK: Trying to match opcode PSHUFBrr -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode // CHECK: Trying to match opcode PSHUFBrm // CHECK: Matching formal operand class MCK_Mem128 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 2 encodings with mnemonic 'sha1rnds4' // CHECK: Trying to match opcode SHA1RNDS4rri // CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:1): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm2): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm2): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 4 encodings with mnemonic 'pinsrw' // CHECK: Trying to match opcode MMX_PINSRWrr @@ -24,7 +24,7 @@ // CHECK: Trying to match opcode PINSRWrr // CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:3): match success using generic matcher // CHECK: Matching formal operand class MCK_GR32orGR64 against actual operand at index 2 (Reg:ecx): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm5): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm5): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 2 encodings with mnemonic 'crc32l' // CHECK: Trying to match opcode CRC32r32r32