diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -555,6 +555,39 @@ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + auto setF16Action = [&] (MVT VT, LegalizeAction Action) { + setOperationAction(ISD::FABS, VT, Action); + setOperationAction(ISD::FNEG, VT, Action); + setOperationAction(ISD::FCOPYSIGN, VT, Action); + setOperationAction(ISD::FREM, VT, Action); + setOperationAction(ISD::FMA, VT, Action); + setOperationAction(ISD::FMINNUM, VT, Action); + setOperationAction(ISD::FMAXNUM, VT, Action); + setOperationAction(ISD::FMINIMUM, VT, Action); + setOperationAction(ISD::FMAXIMUM, VT, Action); + setOperationAction(ISD::FSIN, VT, Action); + setOperationAction(ISD::FCOS, VT, Action); + setOperationAction(ISD::FSINCOS, VT, Action); + setOperationAction(ISD::FSQRT, VT, Action); + setOperationAction(ISD::FPOW, VT, Action); + setOperationAction(ISD::FLOG, VT, Action); + setOperationAction(ISD::FLOG2, VT, Action); + setOperationAction(ISD::FLOG10, VT, Action); + setOperationAction(ISD::FEXP, VT, Action); + setOperationAction(ISD::FEXP2, VT, Action); + setOperationAction(ISD::FCEIL, VT, Action); + setOperationAction(ISD::FFLOOR, VT, Action); + setOperationAction(ISD::FNEARBYINT, VT, Action); + setOperationAction(ISD::FRINT, VT, Action); + setOperationAction(ISD::BR_CC, VT, Action); + setOperationAction(ISD::SETCC, VT, Action); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SELECT_CC, VT, Action); + setOperationAction(ISD::FROUND, VT, Action); + setOperationAction(ISD::FROUNDEVEN, VT, Action); + setOperationAction(ISD::FTRUNC, VT, Action); + }; + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { // f16, f32 and f64 use SSE. // Set up the FP register classes. @@ -592,40 +625,11 @@ } // Half type will be promoted by default. 
- setOperationAction(ISD::FABS, MVT::f16, Promote); - setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setF16Action(MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::BR_CC, MVT::f16, Promote); - setOperationAction(ISD::SETCC, MVT::f16, Promote); - setOperationAction(ISD::SELECT, MVT::f16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); @@ -975,6 +979,8 @@ : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); + addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass @@ -1056,7 +1062,7 @@ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { + for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1067,12 +1073,18 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); // Custom lower v2i64 and v2f64 selects. 
setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); + setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); @@ -1272,6 +1284,8 @@ : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); + addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass @@ -1475,7 +1489,7 @@ // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v8f32, MVT::v4f64 }) { + MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1486,6 +1500,11 @@ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } + setF16Action(MVT::v16f16, Expand); + setOperationAction(ISD::FADD, MVT::v16f16, Expand); + setOperationAction(ISD::FSUB, MVT::v16f16, Expand); + setOperationAction(ISD::FMUL, MVT::v16f16, Expand); + setOperationAction(ISD::FDIV, MVT::v16f16, Expand); if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); @@ -1500,11 +1519,21 @@ } } - if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && + Subtarget.hasF16C()) { + for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + } + for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); + } + for (unsigned Opc : { ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV }) + setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); + + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); } // This block controls legalization of the mask vector sizes that are @@ -1587,6 +1616,7 @@ addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { @@ -1799,7 +1829,7 @@ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, - MVT::v16f32, MVT::v8f64 }) { + MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); @@ -1810,6 +1840,13 @@ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v32f16, Expand); + 
setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + for (unsigned Opc : { ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV }) + setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); @@ -2020,7 +2057,6 @@ // AVX512_FP16 scalar operations setGroup(MVT::f16); - addRegisterClass(MVT::f16, &X86::FR16XRegClass); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -2034,6 +2070,7 @@ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); @@ -2041,7 +2078,6 @@ if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); - addRegisterClass(MVT::v32f16, &X86::VR512RegClass); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); @@ -2080,8 +2116,6 @@ } if (Subtarget.hasVLX()) { - addRegisterClass(MVT::v8f16, &X86::VR128XRegClass); - addRegisterClass(MVT::v16f16, &X86::VR256XRegClass); setGroup(MVT::v8f16); setGroup(MVT::v16f16); @@ -2371,6 +2405,10 @@ !Subtarget.hasBWI()) return TypeSplitVector; + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + return TypeSplitVector; + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -2415,22 +2453,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return RegisterVT; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register type is f16. We override the type to v8f16 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return MVT::v8f16; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return MVT::v8f16; + } // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && @@ -2443,22 +2480,21 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return NumRegisters; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register number is 3. We override the number to 1 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return 1; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return 1; + } // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if // x87 is disabled. @@ -9605,13 +9641,13 @@ EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || - (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || + CVT == MVT::f16 || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) @@ -14068,6 +14104,16 @@ return ISD::isNON_EXTLoad(V.getNode()); } +template +static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { + return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16(); +} + +template +bool X86TargetLowering::isSoftFP16(T VT) const { + return ::isSoftFP16(VT, Subtarget); +} + /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower @@ -14079,6 +14125,9 @@ MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); + int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); @@ -19383,6 +19432,15 @@ SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + if (isSoftFP16(VT)) { + MVT NVT = VT.changeVectorElementTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, + DAG.getBitcast(NVT, LHS), + DAG.getBitcast(NVT, RHS))); + } + // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && @@ -19406,8 +19464,6 @@ if (!Subtarget.hasSSE41()) return SDValue(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); @@ -20793,16 +20849,6 @@ return Cvt; } -template <typename T> -static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { - return VT == MVT::f16 && !Subtarget.hasFP16(); -} - -template <typename T> -bool X86TargetLowering::isSoftFP16(T VT) const { - return ::isSoftFP16(VT, Subtarget); -} - static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { bool IsStrict = Op->isStrictFPOpcode(); SDValue Src = Op.getOperand(IsStrict ? 1 : 0); @@ -22763,7 +22809,7 @@ return Op; if (SVT.getVectorElementType() == MVT::f16) { - assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); + assert(Subtarget.hasF16C() && "Unexpected features!"); if (SVT == MVT::v2f16) In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, DAG.getUNDEF(MVT::v2f16)); @@ -22798,27 +22844,31 @@ if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) return SDValue(); - if (VT == MVT::f16) { + if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT)) { if (Subtarget.hasFP16()) return Op; - if (SVT != MVT::f32) { + if (SVT.getScalarType() != MVT::f32) { + MVT TmpVT = + VT.isVector() ? SVT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) return DAG.getNode( ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, {Chain, - DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, + DAG.getNode(ISD::STRICT_FP_ROUND, DL, {TmpVT, MVT::Other}, {Chain, In, Op2}), Op2}); return DAG.getNode(ISD::FP_ROUND, DL, VT, - DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), - Op2); + DAG.getNode(ISD::FP_ROUND, DL, TmpVT, In, Op2), Op2); } if (!Subtarget.hasF16C()) return SDValue(); + if (VT.isVector()) + return Op; + SDValue Res; SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, MVT::i32); @@ -24113,10 +24163,10 @@ SDLoc dl(Op); if (isFP) { -#ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); -#endif + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); @@ -24678,6 +24728,9 @@ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); + if (isSoftFP16(Op0.getValueType())) + return SDValue(); + // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { @@ -32832,20 +32885,50 @@ case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); + SDValue Rnd = N->getOperand(IsStrict ? 2 : 1); + EVT SrcVT = Src.getValueType(); EVT VT = N->getValueType(0); - EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + SDValue V; if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { SDValue Ext = IsStrict ?
DAG.getConstantFP(0.0, dl, MVT::v2f32) : DAG.getUNDEF(MVT::v2f32); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); } + if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { + assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); + if (SrcVT == MVT::v2f64) { + if (IsStrict) + Src = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, + {MVT::v4f32, MVT::Other}, {Chain, Src}); + else + Src = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Src); + } else if (SrcVT == MVT::v4f64) { + if (IsStrict) + Src = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {MVT::v4f32, MVT::Other}, + {Chain, Src, Rnd}); + else + Src = DAG.getNode(ISD::FP_ROUND, dl, MVT::v4f32, Src, Rnd); + } + + if (IsStrict) + V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Chain, Src, Rnd}); + else + V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd); + + Results.push_back(DAG.getBitcast(MVT::v8f16, V)); + if (IsStrict) + Results.push_back(V.getValue(1)); + return; + } if (!isTypeLegal(Src.getValueType())) return; - SDValue V; + EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, - {N->getOperand(0), Src}); + {Chain, Src}); else V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); Results.push_back(V); @@ -39549,11 +39632,6 @@ SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); - // FIXME: Remove this after we support vector FP16 - if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(), - Subtarget)) - return SDValue(); - if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; @@ -44612,7 +44690,7 @@ } // Early exit check - if (!TLI.isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3769,12 +3769,16 @@ (VMOVDQA64Zrm addr:$src)>; def : Pat<(alignedloadv32i16 addr:$src), (VMOVDQA64Zrm addr:$src)>; + def : Pat<(alignedloadv32f16 addr:$src), + (VMOVAPSZrm addr:$src)>; def : Pat<(alignedloadv64i8 addr:$src), (VMOVDQA64Zrm addr:$src)>; def : Pat<(loadv16i32 addr:$src), (VMOVDQU64Zrm addr:$src)>; def : Pat<(loadv32i16 addr:$src), (VMOVDQU64Zrm addr:$src)>; + def : Pat<(loadv32f16 addr:$src), + (VMOVUPSZrm addr:$src)>; def : Pat<(loadv64i8 addr:$src), (VMOVDQU64Zrm addr:$src)>; @@ -3783,12 +3787,16 @@ (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v16i32 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v32i16 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32f16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(store (v64i8 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; } @@ -3799,12 +3807,16 @@ (VMOVDQA64Z128rm addr:$src)>; def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSZ128rm addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQA64Z128rm addr:$src)>; def : Pat<(loadv4i32
addr:$src), (VMOVDQU64Z128rm addr:$src)>; def : Pat<(loadv8i16 addr:$src), (VMOVDQU64Z128rm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSZ128rm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQU64Z128rm addr:$src)>; @@ -3813,12 +3825,16 @@ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v4i32 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v8i16 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8f16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v16i8 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; @@ -3827,12 +3843,16 @@ (VMOVDQA64Z256rm addr:$src)>; def : Pat<(alignedloadv16i16 addr:$src), (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSZ256rm addr:$src)>; def : Pat<(alignedloadv32i8 addr:$src), (VMOVDQA64Z256rm addr:$src)>; def : Pat<(loadv8i32 addr:$src), (VMOVDQU64Z256rm addr:$src)>; def : Pat<(loadv16i16 addr:$src), (VMOVDQU64Z256rm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSZ256rm addr:$src)>; def : Pat<(loadv32i8 addr:$src), (VMOVDQU64Z256rm addr:$src)>; @@ -3841,12 +3861,16 @@ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v8i32 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v16i16 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16f16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } @@ -3855,16 +3879,12 @@ (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; - def : Pat<(v32f16 (alignedloadv32f16 addr:$src)), - (VMOVAPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (loadv32f16 addr:$src)), - (VMOVUPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; @@ -3878,10 +3898,6 @@ def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), - (VMOVAPSZmr addr:$dst, VR512:$src)>; - def : Pat<(store (v32f16 VR512:$src), addr:$dst), - (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } @@ 
-3890,16 +3906,12 @@ (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; - def : Pat<(v16f16 (alignedloadv16f16 addr:$src)), - (VMOVAPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (loadv16f16 addr:$src)), - (VMOVUPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; @@ -3913,10 +3925,6 @@ def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), - (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; - def : Pat<(store (v16f16 VR256X:$src), addr:$dst), - (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; @@ -3924,16 +3932,12 @@ (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; - def : Pat<(v8f16 (alignedloadv8f16 addr:$src)), - (VMOVAPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (loadv8f16 addr:$src)), - (VMOVUPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; @@ -3947,10 +3951,6 @@ def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), - (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; - def : Pat<(store (v8f16 VR128X:$src), addr:$dst), - (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; } diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -140,6 +140,7 @@ let Predicates = [NoAVX512] in { def : Pat<(v16i8 immAllZerosV), (V_SET0)>; def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v8f16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; def : Pat<(v2i64 immAllZerosV), (V_SET0)>; def : Pat<(v2f64 immAllZerosV), (V_SET0)>; @@ -159,6 +160,7 @@ let Predicates = [NoAVX512] in { def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; @@ -568,10 +570,27 @@ (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v8i32 
VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v16i16 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSrm addr:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16f16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; } // Use movaps / movups for SSE integer load / store (one byte shorter). @@ -613,6 +632,17 @@ (MOVUPSmr addr:$dst, VR128:$src)>; } +let Predicates = [HasSSE2] in { + def : Pat<(alignedloadv8f16 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Low packed FP Instructions //===----------------------------------------------------------------------===// @@ -3136,6 +3166,8 @@ (VMOVNTDQYmr addr:$dst, VR256:$src)>; def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), (VMOVNTDQYmr addr:$dst, VR256:$src)>; @@ -3143,6 +3175,8 @@ (VMOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), (VMOVNTDQmr addr:$dst, VR128:$src)>; } @@ -3152,6 +3186,8 @@ (MOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), (MOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), (MOVNTDQmr addr:$dst, VR128:$src)>; } @@ -3374,12 +3410,16 @@ (VMOVDQArm addr:$src)>; def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVDQArm addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQArm addr:$src)>; def : Pat<(loadv4i32 addr:$src), (VMOVDQUrm addr:$src)>; def : Pat<(loadv8i16 addr:$src), (VMOVDQUrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVDQUrm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQUrm addr:$src)>; @@ -3387,12 +3427,16 @@ (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst,
VR128:$src)>; def : Pat<(store (v4i32 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v8i16 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v16i8 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; } @@ -6431,6 +6475,8 @@ (VMOVNTDQAYrm addr:$src)>; def : Pat<(v16i16 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v16f16 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; def : Pat<(v32i8 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; } @@ -6446,6 +6492,8 @@ (VMOVNTDQArm addr:$src)>; def : Pat<(v8i16 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; + def : Pat<(v8f16 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; def : Pat<(v16i8 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; } @@ -6461,6 +6509,8 @@ (MOVNTDQArm addr:$src)>; def : Pat<(v8i16 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; + def : Pat<(v8f16 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; def : Pat<(v16i8 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; } @@ -7050,6 +7100,8 @@ (VBROADCASTF128 addr:$src)>; def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; +def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; } @@ -7095,6 +7147,7 @@ defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>; defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; } @@ -7150,6 +7203,7 @@ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>; + defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>; } @@ -7189,6 +7244,7 @@ defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>; defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; } @@ -7503,6 +7560,10 @@ (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; @@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR
0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; @@ -7759,6 +7823,7 @@ defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>; defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>; defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>; } @@ -7781,6 +7847,7 @@ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>; + defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>; } @@ -7801,6 +7869,7 @@ defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>; defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1297,29 +1297,6 @@ LT.first = NumOfDests * NumOfShufflesPerDest; } - static const CostTblEntry AVX512FP16ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw - - {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw - {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw - {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb - - {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb - - {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w - }; - - if (!ST->useSoftFloat() && ST->hasFP16()) - if (const auto *Entry = - CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; - static const CostTblEntry AVX512VBMIShuffleTbl[] = { {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb @@ -1339,17 +1316,22 @@ static const CostTblEntry AVX512BWShuffleTbl[] = { {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, //
vpermw {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 @@ -1369,6 +1351,7 @@ {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd @@ -1376,6 +1359,7 @@ {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca + {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd @@ -1408,11 +1392,14 @@ // FIXME: This just applies the type legalization cost rules above // assuming these completely split. {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14}, {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42}, {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq + {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps @@ -1430,6 +1417,7 @@ {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd @@ -1437,9 +1425,11 @@ {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb + {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb + {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd @@ -1448,6 +1438,8 @@ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb + // + vpblendvb {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb @@ -1457,6 +1449,8 @@ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb // + vpblendvb + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb + // + vpblendvb {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb // + vpblendvb }; @@ -1493,6 +1487,7 @@ {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd @@ -1501,6 +1496,8 @@ {TTI::SK_Reverse, 
MVT::v8i32, 2}, // vperm2f128 + vpermilps {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb // + vinsertf128 + {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb + // + vinsertf128 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb // + vinsertf128 @@ -1509,6 +1506,7 @@ {TTI::SK_Select, MVT::v8i32, 1}, // vblendps {TTI::SK_Select, MVT::v8f32, 1}, // vblendps {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor + {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd @@ -1517,6 +1515,8 @@ {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 @@ -1526,6 +1526,8 @@ {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb // + 4*por + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb // + 4*por + vinsertf128 }; @@ -1540,6 +1542,7 @@ {TTI::SK_Select, MVT::v4i32, 1}, // pblendw {TTI::SK_Select, MVT::v4f32, 1}, // blendps {TTI::SK_Select, MVT::v8i16, 1}, // pblendw + {TTI::SK_Select, MVT::v8f16, 1}, // pblendw {TTI::SK_Select, MVT::v16i8, 1} // pblendvb }; @@ -1549,18 +1552,23 @@ static const CostTblEntry SSSE3ShuffleTbl[] = { {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb + {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb + {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por }; @@ -1573,12 +1581,14 @@ {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus @@ -1586,6 +1596,7 @@ {TTI::SK_Select, MVT::v2f64, 1}, // movsd {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por + {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd @@ -1593,6 +1604,8 @@ {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd 
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw // + pshufd/unpck + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw + // + pshufd/unpck { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + 2*packus @@ -1600,6 +1613,7 @@ { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute + { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute }; @@ -5219,7 +5233,7 @@ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) return true; - if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16()) + if (ScalarTy->isHalfTy() && ST->hasBWI()) return true; if (!ScalarTy->isIntegerTy()) @@ -5641,8 +5655,7 @@ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; - if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || - (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy())) + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) return HasBW; return false; }; diff --git a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll --- a/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll +++ b/llvm/test/Analysis/CostModel/X86/fptoi_sat.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 -; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ ; ; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 @@ -877,26 +877,26 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x
i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s32 = call <8 x 
i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'fp16'
@@ -930,81 +930,28 @@
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX1-LABEL: 'fp16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
 ; AVX2-LABEL: 'fp16'
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
@@ -1036,80 +983,133 @@
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512-LABEL: 'fp16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512F-LABEL: 'fp16'
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 155 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512DQ-LABEL: 'fp16'
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 155 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 158 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SLM-LABEL: 'fp16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
@@ -1142,26 +1142,26 @@
 ; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
 ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
 ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; SLM-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
@@ -65,12 +65,12 @@
 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -128,12 +128,12 @@
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -191,12 +191,12 @@
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -254,12 +254,12 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -317,12 +317,12 @@
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -380,12 +380,12 @@
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll
@@ -3,8 +3,8 @@
 define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024, <8 x half> %src128_1, <16 x half> %src256_1, <32 x half> %src512_1, <64 x half> %src1024_1) {
 ; CHECK-LABEL: 'test_vXf16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> %src512_1, <32 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x half> %src1024, <64 x half> %src1024_1, <64 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2226,94 +2226,83 @@
 define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) {
 ; KNL-LABEL: test_concat_v2i1:
 ; KNL: ## %bb.0:
-; KNL-NEXT: movzwl 2(%rdi), %eax
-; KNL-NEXT: movzwl (%rdi), %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: vpextrw $0, %xmm0, %eax
+; KNL-NEXT: movzwl %ax, %eax
+; KNL-NEXT: vmovd %eax, %xmm1
+; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
+; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; KNL-NEXT: vucomiss %xmm2, %xmm1
+; KNL-NEXT: setb %al
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: vpsrld $16, %xmm0, %xmm0
+; KNL-NEXT: vpextrw $0, %xmm0, %eax
+; KNL-NEXT: movzwl %ax, %eax
+; KNL-NEXT: vmovd %eax, %xmm0
 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
-; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; KNL-NEXT: vucomiss %xmm1, %xmm0
-; KNL-NEXT: setb %cl
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: kmovw %ecx, %k0
-; KNL-NEXT: vmovd %eax, %xmm2
-; KNL-NEXT: vcvtph2ps %xmm2, %xmm2
-; KNL-NEXT: vucomiss %xmm1, %xmm2
+; KNL-NEXT: vucomiss %xmm2, %xmm0
 ; KNL-NEXT: setb %al
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $1, %k1, %k1
 ; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vucomiss %xmm2, %xmm1
 ; KNL-NEXT: seta %al
 ; KNL-NEXT: andl $1, %eax
 ; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vucomiss %xmm1, %xmm2
+; KNL-NEXT: vucomiss %xmm2, %xmm0
 ; KNL-NEXT: seta %al
 ; KNL-NEXT: kmovw %eax, %k2
 ; KNL-NEXT: kshiftlw $1, %k2, %k2
 ; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kshiftrw $1, %k0, %k1
-; KNL-NEXT: kmovw %k1, %edi
-; KNL-NEXT: movzwl 2(%rsi), %eax
-; KNL-NEXT: xorl %ecx, %ecx
-; KNL-NEXT: testb $1, %dil
-; KNL-NEXT: cmovel %ecx, %eax
-;
KNL-NEXT: kmovw %k0, %edi -; KNL-NEXT: testb $1, %dil -; KNL-NEXT: je LBB85_2 -; KNL-NEXT: ## %bb.1: -; KNL-NEXT: movl (%rsi), %ecx -; KNL-NEXT: LBB85_2: -; KNL-NEXT: movw %cx, (%rdx) -; KNL-NEXT: movw %ax, 2(%rdx) +; KNL-NEXT: kandw %k1, %k0, %k1 +; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm1, %ymm1 +; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vmovd %xmm0, (%rdx) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_concat_v2i1: ; SKX: ## %bb.0: -; SKX-NEXT: movzwl (%rdi), %eax -; SKX-NEXT: movzwl 2(%rdi), %ecx -; SKX-NEXT: vmovd %ecx, %xmm0 -; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 -; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SKX-NEXT: vucomiss %xmm1, %xmm0 -; SKX-NEXT: setb %cl -; SKX-NEXT: kmovd %ecx, %k0 +; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 +; SKX-NEXT: vpextrw $0, %xmm1, %eax +; SKX-NEXT: movzwl %ax, %eax +; SKX-NEXT: vmovd %eax, %xmm1 +; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 +; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vucomiss %xmm2, %xmm1 +; SKX-NEXT: setb %al +; SKX-NEXT: kmovd %eax, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: vmovd %eax, %xmm2 -; SKX-NEXT: vcvtph2ps %xmm2, %xmm2 -; SKX-NEXT: vucomiss %xmm1, %xmm2 +; SKX-NEXT: vpextrw $0, %xmm0, %eax +; SKX-NEXT: movzwl %ax, %eax +; SKX-NEXT: vmovd %eax, %xmm0 +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; SKX-NEXT: vucomiss %xmm2, %xmm0 ; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korw %k0, %k1, %k0 -; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vucomiss %xmm1, %xmm0 +; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vucomiss %xmm2, %xmm1 ; SKX-NEXT: seta %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $1, %k1, %k1 -; SKX-NEXT: vucomiss %xmm1, %xmm2 +; SKX-NEXT: vucomiss %xmm2, %xmm0 ; SKX-NEXT: seta %al ; SKX-NEXT: kmovd %eax, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $7, %k2, %k2 ; SKX-NEXT: korw %k1, %k2, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edi -; SKX-NEXT: movzwl 2(%rsi), %eax -; SKX-NEXT: xorl %ecx, %ecx -; SKX-NEXT: testb $1, %dil -; SKX-NEXT: cmovel %ecx, %eax -; SKX-NEXT: kmovd %k0, %edi -; SKX-NEXT: testb $1, %dil -; SKX-NEXT: je LBB85_2 -; SKX-NEXT: ## %bb.1: -; SKX-NEXT: movl (%rsi), %ecx -; SKX-NEXT: LBB85_2: -; SKX-NEXT: movw %cx, (%rdx) -; SKX-NEXT: movw %ax, 2(%rdx) +; SKX-NEXT: kandw %k1, %k0, %k1 +; SKX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; SKX-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vmovd %xmm0, (%rdx) ; SKX-NEXT: retq %tmp = load <2 x half>, <2 x half>* %arg, align 8 %tmp3 = fcmp fast olt <2 x half> %tmp, diff --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll --- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -156,153 +156,10 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr) { ; CHECK-LABEL: test_mask_load_16xf16: ; CHECK: ## %bb.0: -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovmskb %xmm0, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB12_1 -; CHECK-NEXT: ## %bb.2: ## %cond.load -; CHECK-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm8 -; CHECK-NEXT: jmp LBB12_3 -; CHECK-NEXT: LBB12_1: -; CHECK-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; CHECK-NEXT: LBB12_3: ## 
%else -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; CHECK-NEXT: vmovdqa %xmm2, %xmm10 -; CHECK-NEXT: vmovdqa %xmm2, %xmm4 -; CHECK-NEXT: vmovdqa %xmm2, %xmm5 -; CHECK-NEXT: vmovdqa %xmm2, %xmm6 -; CHECK-NEXT: vmovdqa %xmm2, %xmm7 -; CHECK-NEXT: vmovdqa %xmm2, %xmm1 -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: vmovdqa %xmm2, %xmm3 -; CHECK-NEXT: vmovdqa %xmm2, %xmm11 -; CHECK-NEXT: vmovdqa %xmm2, %xmm12 -; CHECK-NEXT: vmovdqa %xmm2, %xmm13 -; CHECK-NEXT: vmovdqa %xmm2, %xmm14 -; CHECK-NEXT: testb $2, %cl -; CHECK-NEXT: je LBB12_4 -; CHECK-NEXT: ## %bb.5: ## %cond.load1 -; CHECK-NEXT: vmovdqa %xmm2, %xmm15 -; CHECK-NEXT: vpinsrw $0, 2(%rsi), %xmm0, %xmm2 -; CHECK-NEXT: testb $4, %cl -; CHECK-NEXT: jne LBB12_7 -; CHECK-NEXT: jmp LBB12_8 -; CHECK-NEXT: LBB12_4: -; CHECK-NEXT: vmovdqa %xmm2, %xmm15 -; CHECK-NEXT: testb $4, %cl -; CHECK-NEXT: je LBB12_8 -; CHECK-NEXT: LBB12_7: ## %cond.load4 -; CHECK-NEXT: vpinsrw $0, 4(%rsi), %xmm0, %xmm10 -; CHECK-NEXT: LBB12_8: ## %else5 -; CHECK-NEXT: testb $8, %cl -; CHECK-NEXT: jne LBB12_9 -; CHECK-NEXT: ## %bb.10: ## %else8 -; CHECK-NEXT: testb $16, %cl -; CHECK-NEXT: jne LBB12_11 -; CHECK-NEXT: LBB12_12: ## %else11 -; CHECK-NEXT: testb $32, %cl -; CHECK-NEXT: jne LBB12_13 -; CHECK-NEXT: LBB12_14: ## %else14 -; CHECK-NEXT: testb $64, %cl -; CHECK-NEXT: jne LBB12_15 -; CHECK-NEXT: LBB12_16: ## %else17 -; CHECK-NEXT: testb $-128, %cl -; CHECK-NEXT: jne LBB12_17 -; CHECK-NEXT: LBB12_18: ## %else20 -; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 -; CHECK-NEXT: jne LBB12_19 -; CHECK-NEXT: LBB12_20: ## %else23 -; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 -; CHECK-NEXT: jne LBB12_21 -; CHECK-NEXT: LBB12_22: ## %else26 -; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 -; CHECK-NEXT: jne LBB12_23 -; CHECK-NEXT: LBB12_24: ## %else29 -; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 -; CHECK-NEXT: jne LBB12_25 -; CHECK-NEXT: LBB12_26: ## %else32 -; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 -; CHECK-NEXT: jne LBB12_27 -; CHECK-NEXT: LBB12_28: ## %else35 -; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 -; CHECK-NEXT: jne LBB12_29 -; CHECK-NEXT: LBB12_30: ## %else38 -; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 -; CHECK-NEXT: jne LBB12_31 -; CHECK-NEXT: LBB12_32: ## %else41 -; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 -; CHECK-NEXT: je LBB12_34 -; CHECK-NEXT: LBB12_33: ## %cond.load43 -; CHECK-NEXT: vpinsrw $0, 30(%rsi), %xmm0, %xmm9 -; CHECK-NEXT: LBB12_34: ## %else44 -; CHECK-NEXT: vpextrw $0, %xmm8, (%rax) -; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rax) -; CHECK-NEXT: vpextrw $0, %xmm10, 4(%rax) -; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rax) -; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rax) -; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rax) -; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rax) -; CHECK-NEXT: vpextrw $0, %xmm1, 14(%rax) -; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rax) -; CHECK-NEXT: vpextrw $0, %xmm3, 18(%rax) -; CHECK-NEXT: vpextrw $0, %xmm11, 20(%rax) -; CHECK-NEXT: vpextrw $0, %xmm12, 22(%rax) -; CHECK-NEXT: vpextrw $0, %xmm13, 24(%rax) -; CHECK-NEXT: vpextrw $0, %xmm14, 26(%rax) -; CHECK-NEXT: vpextrw $0, %xmm15, 28(%rax) -; CHECK-NEXT: vpextrw $0, %xmm9, 30(%rax) +; CHECK-NEXT: vpmovb2m %xmm0, %k1 +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; CHECK-NEXT: retq -; CHECK-NEXT: LBB12_9: ## %cond.load7 -; CHECK-NEXT: vpinsrw $0, 6(%rsi), %xmm0, %xmm4 -; CHECK-NEXT: testb $16, %cl -; CHECK-NEXT: je LBB12_12 -; CHECK-NEXT: LBB12_11: ## %cond.load10 -; CHECK-NEXT: vpinsrw $0, 8(%rsi), %xmm0, %xmm5 -; CHECK-NEXT: testb $32, %cl -; 
CHECK-NEXT: je LBB12_14 -; CHECK-NEXT: LBB12_13: ## %cond.load13 -; CHECK-NEXT: vpinsrw $0, 10(%rsi), %xmm0, %xmm6 -; CHECK-NEXT: testb $64, %cl -; CHECK-NEXT: je LBB12_16 -; CHECK-NEXT: LBB12_15: ## %cond.load16 -; CHECK-NEXT: vpinsrw $0, 12(%rsi), %xmm0, %xmm7 -; CHECK-NEXT: testb $-128, %cl -; CHECK-NEXT: je LBB12_18 -; CHECK-NEXT: LBB12_17: ## %cond.load19 -; CHECK-NEXT: vpinsrw $0, 14(%rsi), %xmm0, %xmm1 -; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 -; CHECK-NEXT: je LBB12_20 -; CHECK-NEXT: LBB12_19: ## %cond.load22 -; CHECK-NEXT: vpinsrw $0, 16(%rsi), %xmm0, %xmm0 -; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 -; CHECK-NEXT: je LBB12_22 -; CHECK-NEXT: LBB12_21: ## %cond.load25 -; CHECK-NEXT: vpinsrw $0, 18(%rsi), %xmm0, %xmm3 -; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 -; CHECK-NEXT: je LBB12_24 -; CHECK-NEXT: LBB12_23: ## %cond.load28 -; CHECK-NEXT: vpinsrw $0, 20(%rsi), %xmm0, %xmm11 -; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 -; CHECK-NEXT: je LBB12_26 -; CHECK-NEXT: LBB12_25: ## %cond.load31 -; CHECK-NEXT: vpinsrw $0, 22(%rsi), %xmm0, %xmm12 -; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 -; CHECK-NEXT: je LBB12_28 -; CHECK-NEXT: LBB12_27: ## %cond.load34 -; CHECK-NEXT: vpinsrw $0, 24(%rsi), %xmm0, %xmm13 -; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 -; CHECK-NEXT: je LBB12_30 -; CHECK-NEXT: LBB12_29: ## %cond.load37 -; CHECK-NEXT: vpinsrw $0, 26(%rsi), %xmm0, %xmm14 -; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 -; CHECK-NEXT: je LBB12_32 -; CHECK-NEXT: LBB12_31: ## %cond.load40 -; CHECK-NEXT: vpinsrw $0, 28(%rsi), %xmm0, %xmm15 -; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 -; CHECK-NEXT: jne LBB12_33 -; CHECK-NEXT: jmp LBB12_34 %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer) ret <16 x half> %res } @@ -313,127 +170,9 @@ ; CHECK-LABEL: test_mask_store_16xf16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovmskb %xmm0, %eax -; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: jne LBB13_1 -; CHECK-NEXT: ## %bb.2: ## %else -; CHECK-NEXT: testb $2, %al -; CHECK-NEXT: jne LBB13_3 -; CHECK-NEXT: LBB13_4: ## %else2 -; CHECK-NEXT: testb $4, %al -; CHECK-NEXT: jne LBB13_5 -; CHECK-NEXT: LBB13_6: ## %else4 -; CHECK-NEXT: testb $8, %al -; CHECK-NEXT: jne LBB13_7 -; CHECK-NEXT: LBB13_8: ## %else6 -; CHECK-NEXT: testb $16, %al -; CHECK-NEXT: jne LBB13_9 -; CHECK-NEXT: LBB13_10: ## %else8 -; CHECK-NEXT: testb $32, %al -; CHECK-NEXT: jne LBB13_11 -; CHECK-NEXT: LBB13_12: ## %else10 -; CHECK-NEXT: testb $64, %al -; CHECK-NEXT: jne LBB13_13 -; CHECK-NEXT: LBB13_14: ## %else12 -; CHECK-NEXT: testb $-128, %al -; CHECK-NEXT: jne LBB13_15 -; CHECK-NEXT: LBB13_16: ## %else14 -; CHECK-NEXT: testl $256, %eax ## imm = 0x100 -; CHECK-NEXT: jne LBB13_17 -; CHECK-NEXT: LBB13_18: ## %else16 -; CHECK-NEXT: testl $512, %eax ## imm = 0x200 -; CHECK-NEXT: jne LBB13_19 -; CHECK-NEXT: LBB13_20: ## %else18 -; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 -; CHECK-NEXT: jne LBB13_21 -; CHECK-NEXT: LBB13_22: ## %else20 -; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 -; CHECK-NEXT: jne LBB13_23 -; CHECK-NEXT: LBB13_24: ## %else22 -; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 -; CHECK-NEXT: jne LBB13_25 -; CHECK-NEXT: LBB13_26: ## %else24 -; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 -; CHECK-NEXT: jne LBB13_27 -; CHECK-NEXT: LBB13_28: ## %else26 -; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 -; CHECK-NEXT: jne LBB13_29 -; CHECK-NEXT: LBB13_30: ## %else28 -; CHECK-NEXT: testl $32768, %eax ## imm = 
0x8000 -; CHECK-NEXT: jne LBB13_31 -; CHECK-NEXT: LBB13_32: ## %else30 -; CHECK-NEXT: retq -; CHECK-NEXT: LBB13_1: ## %cond.store -; CHECK-NEXT: vpextrw $0, %xmm1, (%rdi) -; CHECK-NEXT: testb $2, %al -; CHECK-NEXT: je LBB13_4 -; CHECK-NEXT: LBB13_3: ## %cond.store1 -; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rdi) -; CHECK-NEXT: testb $4, %al -; CHECK-NEXT: je LBB13_6 -; CHECK-NEXT: LBB13_5: ## %cond.store3 -; CHECK-NEXT: vpextrw $0, %xmm3, 4(%rdi) -; CHECK-NEXT: testb $8, %al -; CHECK-NEXT: je LBB13_8 -; CHECK-NEXT: LBB13_7: ## %cond.store5 -; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rdi) -; CHECK-NEXT: testb $16, %al -; CHECK-NEXT: je LBB13_10 -; CHECK-NEXT: LBB13_9: ## %cond.store7 -; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rdi) -; CHECK-NEXT: testb $32, %al -; CHECK-NEXT: je LBB13_12 -; CHECK-NEXT: LBB13_11: ## %cond.store9 -; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rdi) -; CHECK-NEXT: testb $64, %al -; CHECK-NEXT: je LBB13_14 -; CHECK-NEXT: LBB13_13: ## %cond.store11 -; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rdi) -; CHECK-NEXT: testb $-128, %al -; CHECK-NEXT: je LBB13_16 -; CHECK-NEXT: LBB13_15: ## %cond.store13 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 14(%rdi) -; CHECK-NEXT: testl $256, %eax ## imm = 0x100 -; CHECK-NEXT: je LBB13_18 -; CHECK-NEXT: LBB13_17: ## %cond.store15 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rdi) -; CHECK-NEXT: testl $512, %eax ## imm = 0x200 -; CHECK-NEXT: je LBB13_20 -; CHECK-NEXT: LBB13_19: ## %cond.store17 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 18(%rdi) -; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 -; CHECK-NEXT: je LBB13_22 -; CHECK-NEXT: LBB13_21: ## %cond.store19 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 20(%rdi) -; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 -; CHECK-NEXT: je LBB13_24 -; CHECK-NEXT: LBB13_23: ## %cond.store21 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 22(%rdi) -; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 -; CHECK-NEXT: je LBB13_26 -; CHECK-NEXT: LBB13_25: ## %cond.store23 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 24(%rdi) -; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 -; CHECK-NEXT: je LBB13_28 -; CHECK-NEXT: LBB13_27: ## %cond.store25 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 26(%rdi) -; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 -; CHECK-NEXT: je LBB13_30 -; CHECK-NEXT: LBB13_29: ## %cond.store27 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 28(%rdi) -; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 -; CHECK-NEXT: je LBB13_32 -; CHECK-NEXT: LBB13_31: ## %cond.store29 -; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vpextrw $0, %xmm0, 30(%rdi) +; CHECK-NEXT: vpmovb2m %xmm0, %k1 +; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) ret void diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1434,92 +1434,107 @@ define void @half_vec_compare(<2 x half>* %x, <2 x i8>* %y) { ; KNL-LABEL: half_vec_compare: ; KNL: ## %bb.0: ## %entry -; KNL-NEXT: movzwl 2(%rdi), 
%eax ## encoding: [0x0f,0xb7,0x47,0x02] -; KNL-NEXT: movzwl (%rdi), %ecx ## encoding: [0x0f,0xb7,0x0f] -; KNL-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; KNL-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] +; KNL-NEXT: ## xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] +; KNL-NEXT: vpextrw $0, %xmm1, %eax ## encoding: [0xc5,0xf9,0xc5,0xc1,0x00] +; KNL-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] +; KNL-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] +; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] +; KNL-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] +; KNL-NEXT: movl $65535, %ecx ## encoding: [0xb9,0xff,0xff,0x00,0x00] +; KNL-NEXT: ## imm = 0xFFFF +; KNL-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00] +; KNL-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1] +; KNL-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1] +; KNL-NEXT: vpextrw $0, %xmm0, %edi ## encoding: [0xc5,0xf9,0xc5,0xf8,0x00] +; KNL-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff] +; KNL-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; KNL-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] -; KNL-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] -; KNL-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] -; KNL-NEXT: andl $1, %edx ## encoding: [0x83,0xe2,0x01] -; KNL-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2] +; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] +; KNL-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1] +; KNL-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1] ; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; KNL-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; KNL-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01] -; KNL-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9] -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] -; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] -; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0] +; KNL-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc2,0x01] +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] ; KNL-NEXT: ## fixup A - offset: 4, value: 
{{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; KNL-NEXT: vpextrw $0, %xmm0, (%rsi) ## encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00] -; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; KNL-NEXT: retq ## encoding: [0xc3] ; ; AVX512BW-LABEL: half_vec_compare: ; AVX512BW: ## %bb.0: ## %entry -; AVX512BW-NEXT: movzwl 2(%rdi), %eax ## encoding: [0x0f,0xb7,0x47,0x02] -; AVX512BW-NEXT: movzwl (%rdi), %ecx ## encoding: [0x0f,0xb7,0x0f] -; AVX512BW-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; AVX512BW-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] +; AVX512BW-NEXT: ## xmm0 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] +; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] +; AVX512BW-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] +; AVX512BW-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] +; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] +; AVX512BW-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] +; AVX512BW-NEXT: movl $65535, %ecx ## encoding: [0xb9,0xff,0xff,0x00,0x00] +; AVX512BW-NEXT: ## imm = 0xFFFF +; AVX512BW-NEXT: movl $0, %edx ## encoding: [0xba,0x00,0x00,0x00,0x00] +; AVX512BW-NEXT: cmovnel %ecx, %edx ## encoding: [0x0f,0x45,0xd1] +; AVX512BW-NEXT: cmovpl %ecx, %edx ## encoding: [0x0f,0x4a,0xd1] +; AVX512BW-NEXT: vpextrw $0, %xmm0, %edi ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xf8,0x00] +; AVX512BW-NEXT: movzwl %di, %edi ## encoding: [0x0f,0xb7,0xff] +; AVX512BW-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX512BW-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] -; AVX512BW-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] -; AVX512BW-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] -; AVX512BW-NEXT: andl $1, %edx ## encoding: [0x83,0xe2,0x01] -; AVX512BW-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2] +; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] +; AVX512BW-NEXT: cmovnel %ecx, %eax ## encoding: [0x0f,0x45,0xc1] +; AVX512BW-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1] ; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX512BW-NEXT: kmovd %ecx, %k1 ## encoding: [0xc5,0xfb,0x92,0xc9] -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01] -; AVX512BW-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1] -; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ## encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0] -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0x63,0xc0] +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x01] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] ; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00] -; AVX512BW-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512BW-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: half_vec_compare: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] -; SKX-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; SKX-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] +; SKX-NEXT: ## xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x10] +; SKX-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] +; SKX-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] +; SKX-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9] +; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] +; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] +; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; SKX-NEXT: testb %cl, %cl ## encoding: [0x84,0xc9] +; SKX-NEXT: setne %al ## encoding: [0x0f,0x95,0xc0] +; SKX-NEXT: vpextrw $0, %xmm0, %ecx ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc8,0x00] +; SKX-NEXT: movzwl %cx, %ecx ## encoding: [0x0f,0xb7,0xc9] ; SKX-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] ; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] ; SKX-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] ; SKX-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] -; SKX-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2] -; SKX-NEXT: kshiftlb $1, %k0, %k0 ## encoding: [0xc4,0xe3,0x79,0x32,0xc0,0x01] -; SKX-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; SKX-NEXT: testb %dl, %dl ## encoding: [0x84,0xd2] ; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; SKX-NEXT: kmovd %ecx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc1] +; SKX-NEXT: movw 
$-3, %cx ## encoding: [0x66,0xb9,0xfd,0xff]
 ; SKX-NEXT: kmovd %ecx, %k1 ## encoding: [0xc5,0xfb,0x92,0xc9]
-; SKX-NEXT: kshiftlb $7, %k1, %k1 ## encoding: [0xc4,0xe3,0x79,0x32,0xc9,0x07]
-; SKX-NEXT: kshiftrb $7, %k1, %k1 ## encoding: [0xc4,0xe3,0x79,0x30,0xc9,0x07]
-; SKX-NEXT: korw %k0, %k1, %k0 ## encoding: [0xc5,0xf4,0x45,0xc0]
-; SKX-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
-; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0]
-; SKX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; SKX-NEXT: kandw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x41,0xc1]
+; SKX-NEXT: kmovd %eax, %k1 ## encoding: [0xc5,0xfb,0x92,0xc8]
+; SKX-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01]
+; SKX-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9]
+; SKX-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x05,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; SKX-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
 ; SKX-NEXT: retq ## encoding: [0xc3]
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -963,13 +963,13 @@
 define <8 x half> @load8f16(<8 x half>* %a) {
 ; X64-LABEL: load8f16:
 ; X64: # %bb.0:
-; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: movaps (%rdi), %xmm0
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: load8f16:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovaps (%eax), %xmm0
+; X86-NEXT: movaps (%eax), %xmm0
 ; X86-NEXT: retl
 %res = load <8 x half>, <8 x half>* %a
 ret <8 x half> %res
@@ -1016,13 +1016,13 @@
 define <8 x half> @loadu8f16(<8 x half>* %a) {
 ; X64-LABEL: loadu8f16:
 ; X64: # %bb.0:
-; X64-NEXT: vmovups (%rdi), %xmm0
+; X64-NEXT: movups (%rdi), %xmm0
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: loadu8f16:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovups (%eax), %xmm0
+; X86-NEXT: movups (%eax), %xmm0
 ; X86-NEXT: retl
 %res = load <8 x half>, <8 x half>* %a, align 8
 ret <8 x half> %res
@@ -1070,12 +1070,12 @@
 ; X64-LABEL: store8f16:
 ; X64: # %bb.0:
 ; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
-; X64-NEXT: vmovaps %xmm0, (%rax)
+; X64-NEXT: movaps %xmm0, (%rax)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: store8f16:
 ; X86: # %bb.0:
-; X86-NEXT: vmovaps %xmm0, g8f16
+; X86-NEXT: movaps %xmm0, g8f16
 ; X86-NEXT: retl
 store <8 x half> %a, <8 x half>* @g8f16
 ret void
@@ -1085,12 +1085,12 @@
 ; X64-LABEL: storeu8f16:
 ; X64: # %bb.0:
 ; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
-; X64-NEXT: vmovups %xmm0, (%rax)
+; X64-NEXT: movups %xmm0, (%rax)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: storeu8f16:
 ; X86: # %bb.0:
-; X86-NEXT: vmovups %xmm0, g8f16u
+; X86-NEXT: movups %xmm0, g8f16u
 ; X86-NEXT: retl
 store <8 x half> %a, <8 x half>* @g8f16u, align 8
 ret void
diff --git a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll
@@ -78,7 +78,7 @@
 ;
 ; CHECK-LABEL: test_max_v8f16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
+; CHECK-NEXT: movaps (%rdi),
%xmm1 ; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = load <8 x half>, <8 x half>* %a_ptr @@ -95,7 +95,7 @@ ; ; CHECK-LABEL: test_min_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: movaps (%rdi), %xmm1 ; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = load <8 x half>, <8 x half>* %a_ptr diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -397,59 +397,61 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm3 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm6, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: movdqa 
{{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -503,10 +505,15 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: psrlq $48, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -517,8 +524,7 @@ ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -528,11 +534,10 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -542,9 +547,8 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -554,7 +558,7 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] @@ -602,59 +606,61 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm3 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; 
CHECK-NEXT: pcmpeqd %xmm4, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm6, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -855,36 +861,30 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 @@ -893,37 +893,36 @@ ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -939,36 +938,30 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: 
psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 @@ -977,57 +970,56 @@ ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded 
Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm4, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm1 -; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: por %xmm1, %xmm4 -; CHECK-NEXT: pslld $16, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 +; CHECK-NEXT: pand %xmm0, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; CHECK-NEXT: pxor %xmm4, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm4 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pslld $16, %xmm3 +; CHECK-NEXT: psrad $16, %xmm3 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm4, %xmm0 -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1041,66 +1033,59 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; 
CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 @@ -1132,7 +1117,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm3, %xmm0 -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -1466,16 +1451,16 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: 
.cfi_def_cfa_offset 32 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF @@ -1498,10 +1483,10 @@ ; CHECK-NEXT: cmpq %rax, %r8 ; CHECK-NEXT: sbbq %rdx, %rbx ; CHECK-NEXT: cmovgeq %r8, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movq %rsi, %xmm0 +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -1525,27 +1510,27 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: testq %r14, %r14 ; CHECK-NEXT: cmovneq %rcx, %rbx -; CHECK-NEXT: movq %rbx, %xmm1 -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm0 +; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -1567,16 +1552,16 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx @@ -1596,10 +1581,10 @@ ; CHECK-NEXT: movl $0, %esi ; CHECK-NEXT: sbbq %rdx, %rsi ; CHECK-NEXT: cmovgeq %rcx, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movq %rbx, %xmm0 +; CHECK-NEXT: 
movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -2004,90 +1989,92 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = 
[2147483647,2147483647] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm1 -; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm3, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm5 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm7, %xmm2 -; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm3 -; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] ; CHECK-NEXT: movdqa %xmm2, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm5 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm5, %xmm4 ; CHECK-NEXT: pandn %xmm2, %xmm5 -; CHECK-NEXT: por %xmm3, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pand %xmm6, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 @@ -2108,10 +2095,15 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: psrlq 
$48, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2122,8 +2114,7 @@ ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2133,11 +2124,10 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2147,9 +2137,8 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2159,7 +2148,7 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm2 @@ -2206,87 +2195,89 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; 
CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm1 -; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm3, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm5 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm7, %xmm2 -; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm3 -; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm2 -; CHECK-NEXT: pxor 
%xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pandn %xmm3, %xmm4 ; CHECK-NEXT: por %xmm2, %xmm4 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2447,36 +2438,30 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 @@ -2485,37 +2470,36 @@ ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2529,36 +2513,30 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 @@ -2567,57 +2545,56 @@ ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # 
xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm4, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm1 -; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: por %xmm1, %xmm4 -; CHECK-NEXT: pslld $16, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 +; CHECK-NEXT: pand %xmm0, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; CHECK-NEXT: pxor %xmm4, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm4 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pslld $16, %xmm3 +; CHECK-NEXT: psrad $16, %xmm3 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm4, %xmm0 -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -2630,66 +2607,59 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 144 -; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 @@ -2721,7 +2691,7 @@ ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm3, %xmm0 -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: @@ -3069,16 +3039,16 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; 
CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmpq %rcx, %rax @@ -3111,10 +3081,10 @@ ; CHECK-NEXT: cmovbeq %rbx, %rax ; CHECK-NEXT: cmpq $-1, %rdx ; CHECK-NEXT: cmovneq %rsi, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -3136,17 +3106,17 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixunshfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx @@ -3157,10 +3127,10 @@ ; CHECK-NEXT: cmovneq %rcx, %rbx ; CHECK-NEXT: cmpq $1, %r14 ; CHECK-NEXT: cmoveq %rcx, %rbx -; CHECK-NEXT: movq %rbx, %xmm1 -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm0 +; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -3181,16 +3151,16 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx @@ -3209,10 +3179,10 @@ ; CHECK-NEXT: cmovsq %rcx, %rbx ; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: cmovsq %rcx, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movq %rbx, %xmm0 +; 
CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -543,15 +543,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $136, %rsp -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -565,8 +559,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -579,8 +573,8 @@ ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -591,8 +585,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -607,8 +601,8 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 
; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -619,8 +613,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -633,8 +627,7 @@ ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -645,8 +638,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -655,14 +648,14 @@ ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $136, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq @@ -678,15 +671,9 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $48, %rsp -; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %ebp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -699,8 +686,8 @@ ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %r15d, 
%ebp ; CHECK-NEXT: shll $8, %ebp -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -711,19 +698,18 @@ ; CHECK-NEXT: cmovpl %r15d, %eax ; CHECK-NEXT: movzbl %al, %ebx ; CHECK-NEXT: orl %ebp, %ebx -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: cmovbl %r14d, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %ebp +; CHECK-NEXT: cmoval %r12d, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %ebp -; CHECK-NEXT: shll $8, %ebp -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: cmovpl %r15d, %eax +; CHECK-NEXT: movzbl %al, %ebp +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -732,13 +718,13 @@ ; CHECK-NEXT: cmoval %r12d, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %r15d, %eax -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: orl %ebp, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: pinsrw $1, %ebx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %ebx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -748,8 +734,8 @@ ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -763,8 +749,8 @@ ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $2, %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %ebx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -774,8 +760,8 @@ ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %r15d, %ebx ; CHECK-NEXT: shll $8, %ebx -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -788,7 +774,7 @@ ; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-NEXT: addq $48, %rsp +; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r14 @@ -805,15 +791,9 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $128, %rsp -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -827,8 +807,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -841,8 +821,8 @@ ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -853,8 +833,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -869,8 +849,8 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = 
mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -881,8 +861,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -895,8 +875,7 @@ ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -907,8 +886,8 @@ ; CHECK-NEXT: cmovpl %ebx, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -917,14 +896,14 @@ ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $128, %rsp +; CHECK-NEXT: addq $64, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %rbp @@ -939,15 +918,9 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $128, %rsp -; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss 
@@ -961,8 +934,8 @@
 ; CHECK-NEXT: cmovpl %ebx, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -975,8 +948,7 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -987,8 +959,8 @@
 ; CHECK-NEXT: cmovpl %ebx, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -998,13 +970,13 @@
 ; CHECK-NEXT: ucomiss %xmm0, %xmm0
 ; CHECK-NEXT: cmovpl %ebx, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1015,8 +987,8 @@
 ; CHECK-NEXT: cmovpl %ebx, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1029,8 +1001,8 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1041,8 +1013,8 @@
 ; CHECK-NEXT: cmovpl %ebx, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1057,7 +1029,7 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: addq $128, %rsp
+; CHECK-NEXT: addq $64, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: popq %rbp
@@ -1072,15 +1044,8 @@
 ; CHECK-NEXT: pushq %r15
 ; CHECK-NEXT: pushq %r14
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $128, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: subq $80, %rsp
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1094,8 +1059,8 @@
 ; CHECK-NEXT: cmovpq %r15, %rax
 ; CHECK-NEXT: movq %rax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1105,11 +1070,11 @@
 ; CHECK-NEXT: ucomiss %xmm0, %xmm0
 ; CHECK-NEXT: cmovpq %r15, %rax
 ; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1120,8 +1085,8 @@
 ; CHECK-NEXT: cmovpq %r15, %rax
 ; CHECK-NEXT: movq %rax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1134,8 +1099,8 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1146,8 +1111,8 @@
 ; CHECK-NEXT: cmovpq %r15, %rax
 ; CHECK-NEXT: movq %rax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1160,8 +1125,8 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1172,8 +1137,8 @@
 ; CHECK-NEXT: cmovpq %r15, %rax
 ; CHECK-NEXT: movq %rax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1188,7 +1153,7 @@
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; CHECK-NEXT: addq $128, %rsp
+; CHECK-NEXT: addq $80, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: popq %r15
@@ -1206,17 +1171,12 @@
 ; CHECK-NEXT: pushq %r13
 ; CHECK-NEXT: pushq %r12
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: subq $104, %rsp
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
 ; CHECK-NEXT: xorl %r12d, %r12d
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -1238,8 +1198,8 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovpq %r12, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
@@ -1257,10 +1217,10 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovpq %r12, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
@@ -1276,8 +1236,8 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovpq %r12, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
@@ -1296,10 +1256,10 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovpq %r12, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
 ; CHECK-NEXT: movq %rdx, %rbp
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -1314,8 +1274,8 @@
 ; CHECK-NEXT: cmovpq %r12, %rax
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovpq %r12, %rbp
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
@@ -1334,10 +1294,10 @@
 ; CHECK-NEXT: ucomiss %xmm0, %xmm0
 ; CHECK-NEXT: cmovpq %r12, %r14
 ; CHECK-NEXT: cmovpq %r12, %r15
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
 ; CHECK-NEXT: movq %rax, %r12
 ; CHECK-NEXT: movq %rdx, %r13
@@ -1356,8 +1316,7 @@
 ; CHECK-NEXT: ucomiss %xmm0, %xmm0
 ; CHECK-NEXT: cmovpq %rax, %r12
 ; CHECK-NEXT: cmovpq %rax, %r13
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixsfti@PLT
@@ -1377,13 +1336,17 @@
 ; CHECK-NEXT: cmovpq %rsi, %rax
 ; CHECK-NEXT: movl $0, %ecx
 ; CHECK-NEXT: cmovpq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, 120(%rbx)
-; CHECK-NEXT: movq %rax, 112(%rbx)
-; CHECK-NEXT: movq %r13, 104(%rbx)
-; CHECK-NEXT: movq %r12, 96(%rbx)
-; CHECK-NEXT: movq %r15, 88(%rbx)
-; CHECK-NEXT: movq %r14, 80(%rbx)
-; CHECK-NEXT: movq %rbp, 72(%rbx)
+; CHECK-NEXT: movq %rdx, 8(%rbx)
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movq %r13, 120(%rbx)
+; CHECK-NEXT: movq %r12, 112(%rbx)
+; CHECK-NEXT: movq %r15, 104(%rbx)
+; CHECK-NEXT: movq %r14, 96(%rbx)
+; CHECK-NEXT: movq %rbp, 88(%rbx)
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movq %rax, 80(%rbx)
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movq %rax, 72(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT: movq %rax, 64(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -1398,12 +1361,8 @@
 ; CHECK-NEXT: movq %rax, 24(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT: movq %rax, 16(%rbx)
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rax, 8(%rbx)
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rax, (%rbx)
 ; CHECK-NEXT: movq %rbx, %rax
-; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: addq $104, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r12
 ; CHECK-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -542,15 +542,9 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $136, %rsp
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm7, %xmm0
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: xorl %ebx, %ebx
@@ -562,8 +556,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -574,8 +568,8 @@
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -584,8 +578,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -598,8 +592,8 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -608,8 +602,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -620,8 +614,7 @@
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -630,22 +623,22 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmovbl %ebx, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %ebp, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: addq $136, %rsp
+; CHECK-NEXT: addq $72, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: retq
@@ -660,15 +653,9 @@
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %r15
 ; CHECK-NEXT: pushq %r14
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $56, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm3, %xmm0
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %ebp
 ; CHECK-NEXT: xorl %r14d, %r14d
@@ -679,8 +666,8 @@
 ; CHECK-NEXT: movl $255, %r15d
 ; CHECK-NEXT: cmoval %r15d, %ebp
 ; CHECK-NEXT: shll $8, %ebp
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -689,30 +676,29 @@
 ; CHECK-NEXT: cmoval %r15d, %eax
 ; CHECK-NEXT: movzbl %al, %ebx
 ; CHECK-NEXT: orl %ebp, %ebx
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: cvttss2si %xmm0, %ebp
+; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmovbl %r14d, %ebp
+; CHECK-NEXT: cmovbl %r14d, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: cmoval %r15d, %ebp
-; CHECK-NEXT: shll $8, %ebp
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: cmoval %r15d, %eax
+; CHECK-NEXT: movzbl %al, %ebp
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmovbl %r14d, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %r15d, %eax
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: shll $8, %eax
 ; CHECK-NEXT: orl %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: pinsrw $1, %ebx, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %ebx
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -720,8 +706,8 @@
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %r15d, %ebx
 ; CHECK-NEXT: shll $8, %ebx
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -733,8 +719,8 @@
 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: pinsrw $2, %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %ebx
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -742,8 +728,8 @@
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %r15d, %ebx
 ; CHECK-NEXT: shll $8, %ebx
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -754,7 +740,7 @@
 ; CHECK-NEXT: orl %ebx, %eax
 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: pinsrw $3, %eax, %xmm0
-; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: addq $40, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: popq %r15
@@ -769,15 +755,9 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $136, %rsp
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm7, %xmm0
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: xorl %ebx, %ebx
@@ -789,8 +769,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -801,8 +781,8 @@
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -811,8 +791,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -825,8 +805,8 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -835,8 +815,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -847,8 +827,7 @@
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -857,22 +836,22 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmovbl %ebx, %eax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %ebp, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: addq $136, %rsp
+; CHECK-NEXT: addq $72, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: retq
@@ -885,15 +864,9 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $136, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm3, %xmm0
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: xorl %ebx, %ebx
@@ -905,8 +878,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -917,8 +890,7 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -927,8 +899,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -936,13 +908,13 @@
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -951,8 +923,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -963,8 +935,8 @@
 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -973,8 +945,8 @@
 ; CHECK-NEXT: cmoval %ebp, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: cvttss2si %xmm0, %rax
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -987,7 +959,7 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: addq $136, %rsp
+; CHECK-NEXT: addq $72, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: retq
@@ -1000,15 +972,8 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %r14
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $136, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: subq $88, %rsp
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1027,10 +992,10 @@
 ; CHECK-NEXT: cmovaq %rbx, %rdx
 ; CHECK-NEXT: movq %rdx, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT: cvttss2si %xmm1, %rax
 ; CHECK-NEXT: cvttss2si %xmm0, %rcx
@@ -1043,13 +1008,13 @@
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmovaq %rbx, %rdx
 ; CHECK-NEXT: movq %rdx, %xmm0
-; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT: cvttss2si %xmm1, %rax
 ; CHECK-NEXT: cvttss2si %xmm0, %rcx
@@ -1063,8 +1028,8 @@
 ; CHECK-NEXT: cmovaq %rbx, %rdx
 ; CHECK-NEXT: movq %rdx, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1082,10 +1047,10 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT: cvttss2si %xmm1, %rax
 ; CHECK-NEXT: cvttss2si %xmm0, %rcx
@@ -1099,8 +1064,8 @@
 ; CHECK-NEXT: cmovaq %rbx, %rdx
 ; CHECK-NEXT: movq %rdx, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1118,10 +1083,10 @@
 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT: cvttss2si %xmm1, %rax
 ; CHECK-NEXT: cvttss2si %xmm0, %rcx
@@ -1135,8 +1100,8 @@
 ; CHECK-NEXT: cmovaq %rbx, %rdx
 ; CHECK-NEXT: movq %rdx, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1156,7 +1121,7 @@
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; CHECK-NEXT: addq $136, %rsp
+; CHECK-NEXT: addq $88, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r14
 ; CHECK-NEXT: retq
@@ -1173,20 +1138,15 @@
 ; CHECK-NEXT: pushq %r13
 ; CHECK-NEXT: pushq %r12
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: subq $104, %rsp
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: psrld $16, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
 ; CHECK-NEXT: xorl %r12d, %r12d
-; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: ucomiss %xmm0, %xmm1
@@ -1198,8 +1158,8 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovaq %r13, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
@@ -1213,10 +1173,10 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovaq %r13, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
@@ -1228,8 +1188,8 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovaq %r13, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
@@ -1243,10 +1203,10 @@
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovaq %r13, %rdx
 ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
 ; CHECK-NEXT: movq %rdx, %rbp
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -1258,8 +1218,8 @@
 ; CHECK-NEXT: cmovaq %r13, %rax
 ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: cmovaq %r13, %rbp
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
@@ -1273,10 +1233,10 @@
 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: cmovaq %r13, %r14
 ; CHECK-NEXT: cmovaq %r13, %r15
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
 ; CHECK-NEXT: movq %rax, %r12
 ; CHECK-NEXT: movq %rdx, %r13
@@ -1290,8 +1250,7 @@
 ; CHECK-NEXT: movq $-1, %rax
 ; CHECK-NEXT: cmovaq %rax, %r12
 ; CHECK-NEXT: cmovaq %rax, %r13
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: callq __extendhfsf2@PLT
 ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT: callq __fixunssfti@PLT
@@ -1305,13 +1264,17 @@
 ; CHECK-NEXT: movq $-1, %rcx
 ; CHECK-NEXT: cmovaq %rcx, %rax
 ; CHECK-NEXT: cmovaq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, 120(%rbx)
-; CHECK-NEXT: movq %rax, 112(%rbx)
-; CHECK-NEXT: movq %r13, 104(%rbx)
-; CHECK-NEXT: movq %r12, 96(%rbx)
-; CHECK-NEXT: movq %r15, 88(%rbx)
-; CHECK-NEXT: movq %r14, 80(%rbx)
-; CHECK-NEXT: movq %rbp, 72(%rbx)
+; CHECK-NEXT: movq %rdx, 8(%rbx)
+; CHECK-NEXT: movq %rax, (%rbx)
+; CHECK-NEXT: movq %r13, 120(%rbx)
+; CHECK-NEXT: movq %r12, 112(%rbx)
+; CHECK-NEXT: movq %r15, 104(%rbx)
+; CHECK-NEXT: movq %r14, 96(%rbx)
+; CHECK-NEXT: movq %rbp, 88(%rbx)
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movq %rax, 80(%rbx)
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movq %rax, 72(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT: movq %rax, 64(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -1326,12 +1289,8 @@
 ; CHECK-NEXT: movq %rax, 24(%rbx)
 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT: movq %rax, 16(%rbx)
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rax, 8(%rbx)
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: movq %rax, (%rbx)
 ; CHECK-NEXT: movq %rbx, %rax
-; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: addq $104, %rsp
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r12
 ; CHECK-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -495,288 +495,206 @@
 define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> *%p3) nounwind {
 ; CHECK-LABEL: frem_v32f16:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %r13
-; CHECK-NEXT: pushq %r12
 ; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $1032, %rsp # imm = 0x408
+; CHECK-NEXT: subq $176, %rsp
 ; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm4, %xmm0
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -786,76 +704,104 @@
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %esi
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %edi
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r11d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %ebp
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r14d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r15d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r12d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r13d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r8d
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r9d
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
-; CHECK-NEXT: movw %r10w, 62(%rbx)
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: movw %ax, 60(%rbx)
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: movw %ax, 58(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r10d
-; CHECK-NEXT: movw %si, 56(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %esi
-; CHECK-NEXT: movw %di, 54(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %edi
-; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: movw %ax, 52(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %cx, 50(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %ecx
-; CHECK-NEXT: movw %dx, 48(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %edx
-; CHECK-NEXT: movw %r11w, 46(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r11d
-; CHECK-NEXT: movw %bp, 44(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %ebp
-; CHECK-NEXT: movw %r14w, 42(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r14d
-; CHECK-NEXT: movw %r15w, 40(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r15d
-; CHECK-NEXT: movw %r12w, 38(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r12d
-; CHECK-NEXT: movw %r13w, 36(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r13d
-; CHECK-NEXT: movw %r8w, 34(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r8d
-; CHECK-NEXT: movw %r9w, 32(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r9d
-; CHECK-NEXT: movw %r10w, 30(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %r10d
-; CHECK-NEXT: movw %si, 28(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %esi
-; CHECK-NEXT: movw %di, 26(%rbx)
-; CHECK-NEXT: movw %ax, 24(%rbx)
-; CHECK-NEXT: movw %cx, 22(%rbx)
-; CHECK-NEXT: movw %dx, 20(%rbx)
-; CHECK-NEXT: movw %r11w, 18(%rbx)
-; CHECK-NEXT: movw %bp, 16(%rbx)
-; CHECK-NEXT: movw %r14w, 14(%rbx)
-; CHECK-NEXT: movw %r15w, 12(%rbx)
-; CHECK-NEXT: movw %r12w, 10(%rbx)
-; CHECK-NEXT: movw %r13w, 8(%rbx)
-; CHECK-NEXT: movw %r8w, 6(%rbx)
-; CHECK-NEXT: movw %r9w, 4(%rbx)
-; CHECK-NEXT: movw %r10w, 2(%rbx)
-; CHECK-NEXT: movw %si, (%rbx)
-; CHECK-NEXT: addq $1032, %rsp # imm = 0x408
+; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, 48(%rbx)
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, 32(%rbx)
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, 16(%rbx)
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, (%rbx)
+; CHECK-NEXT: addq $176, %rsp
; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %r12
-; CHECK-NEXT: popq %r13
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
-; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
%frem = frem <32 x half> %a0, %a1
store <32 x half> %frem, <32 x half> *%p3
@@ -1075,217 +956,200 @@
; CHECK-LABEL: frem_v16f16:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $512, %rsp # imm = 0x200
+; CHECK-NEXT: subq $112, %rsp
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
@@ -1296,63 +1160,27 @@
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 30(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 28(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 26(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 24(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 22(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 20(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 18(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 16(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 14(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 12(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 10(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 8(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 6(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 4(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, 2(%rbx)
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-NEXT: movw %ax, (%rbx)
-; CHECK-NEXT: addq $512, %rsp # imm = 0x200
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movdqa %xmm1, 16(%rbx)
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, (%rbx)
+; CHECK-NEXT: addq $112, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
%frem = frem <16 x half> %a0, %a1
@@ -1364,148 +1192,119 @@
; CHECK-LABEL: frem_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $240, %rsp
+; CHECK-NEXT: subq $80, %rsp
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: movss %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm1
; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; 
CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 14(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 12(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 10(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 8(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 6(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 4(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, 2(%rbx) -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, (%rbx) -; CHECK-NEXT: addq $240, %rsp +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: punpckldq (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, (%rbx) +; CHECK-NEXT: addq $80, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %frem = frem <8 x half> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -681,25 +681,8 @@ ; ; BWON-F16C-LABEL: test_trunc64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; BWON-F16C-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovd %xmm1, %eax -; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm1 -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; BWON-F16C-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; BWON-F16C-NEXT: vmovd %xmm2, %ecx -; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %xmm0, %edx -; BWON-F16C-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %xmm0, %esi -; BWON-F16C-NEXT: movw %si, 4(%rdi) -; BWON-F16C-NEXT: movw %dx, (%rdi) -; BWON-F16C-NEXT: movw %cx, 6(%rdi) -; BWON-F16C-NEXT: movw %ax, 2(%rdi) +; BWON-F16C-NEXT: vcvtpd2ps %ymm0, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $0, %xmm0, (%rdi) ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: retq ; @@ -1156,24 +1139,15 @@ ; BWON-F16C-NEXT: movzwl (%rax), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; BWON-F16C-NEXT: vmovq %xmm1, %rax -; BWON-F16C-NEXT: movq %rax, %rcx -; BWON-F16C-NEXT: shrq $48, %rcx -; BWON-F16C-NEXT: movq %rax, %rdx -; BWON-F16C-NEXT: shrq $32, %rdx -; BWON-F16C-NEXT: 
movl %eax, %esi -; BWON-F16C-NEXT: shrl $16, %esi ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vucomiss %xmm0, %xmm0 -; BWON-F16C-NEXT: movl $32256, %edi # imm = 0x7E00 -; BWON-F16C-NEXT: cmovpl %edi, %esi -; BWON-F16C-NEXT: cmovpl %edi, %edx -; BWON-F16C-NEXT: cmovpl %edi, %ecx -; BWON-F16C-NEXT: cmovpl %edi, %eax -; BWON-F16C-NEXT: movw %ax, (%rax) -; BWON-F16C-NEXT: movw %cx, (%rax) -; BWON-F16C-NEXT: movw %dx, (%rax) -; BWON-F16C-NEXT: movw %si, (%rax) +; BWON-F16C-NEXT: movl $65535, %ecx # imm = 0xFFFF +; BWON-F16C-NEXT: cmovnpl %eax, %ecx +; BWON-F16C-NEXT: vmovd %ecx, %xmm0 +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; BWON-F16C-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; BWON-F16C-NEXT: vmovq %xmm0, (%rax) ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: main.45: diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -8,9 +8,7 @@ ; X86-LABEL: ir_fadd_v1f16: ; X86: # %bb.0: ; X86-NEXT: subl $28, %esp -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: pextrw $0, %xmm0, %eax ; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __extendhfsf2 @@ -29,17 +27,16 @@ ; ; X64-LABEL: ir_fadd_v1f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: subq $40, %rsp +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: callq __extendhfsf2@PLT -; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill -; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: callq __extendhfsf2@PLT -; X64-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload +; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __truncsfhf2@PLT -; X64-NEXT: popq %rax +; X64-NEXT: addq $40, %rsp ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v1f16: @@ -86,15 +83,14 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v2f16: ; X86: # %bb.0: -; X86-NEXT: subl $80, %esp -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: subl $84, %esp ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: psrld $16, %xmm0 ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: psrld $16, %xmm0 ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: pextrw $0, %xmm1, %eax ; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill @@ -121,113 +117,60 @@ ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __truncsfhf2 -; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: 
movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll __truncsfhf2 -; X86-NEXT: movaps %xmm0, %xmm1 -; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload -; X86-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-NEXT: addl $80, %esp +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: addl $84, %esp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v2f16: ; X64: # %bb.0: -; X64-NEXT: subq $24, %rsp -; X64-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movaps %xmm2, %xmm0 +; X64-NEXT: subq $72, %rsp +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: psrld $16, %xmm0 +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrld $16, %xmm0 ; X64-NEXT: callq __extendhfsf2@PLT -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: callq __extendhfsf2@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __truncsfhf2@PLT -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: callq __extendhfsf2@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: callq __extendhfsf2@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __truncsfhf2@PLT -; X64-NEXT: movaps %xmm0, %xmm1 -; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: addq $24, %rsp +; X64-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: addq $72, %rsp ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v2f16: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm1, %eax -; F16C-NEXT: vpextrw $0, %xmm3, %ecx -; F16C-NEXT: vpextrw $0, %xmm0, %edx -; F16C-NEXT: vpextrw $0, %xmm2, %esi -; F16C-NEXT: movzwl %si, %esi -; F16C-NEXT: vmovd %esi, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %dx, %edx -; F16C-NEXT: vmovd %edx, %xmm1 -; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %edx -; F16C-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 -; F16C-NEXT: movzwl %cx, %ecx -; F16C-NEXT: vmovd %ecx, %xmm1 -; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: movzwl %ax, %eax -; F16C-NEXT: vmovd %eax, %xmm2 -; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; 
F16C-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; F16C-NEXT: vmovd %xmm1, %eax -; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 +; F16C-NEXT: vcvtph2ps %xmm1, %ymm1 +; F16C-NEXT: vcvtph2ps %xmm0, %ymm0 +; F16C-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; F16C-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; F16C-NEXT: vzeroupper ; F16C-NEXT: retq ; ; F16C-O0-LABEL: ir_fadd_v2f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: vpextrw $0, %xmm2, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movzwl %ax, %eax -; F16C-O0-NEXT: vmovd %eax, %xmm2 -; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 -; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movzwl %ax, %eax -; F16C-O0-NEXT: vmovd %eax, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-O0-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-O0-NEXT: vmovd %xmm0, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: # implicit-def: $xmm0 -; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 -; F16C-O0-NEXT: vpextrw $0, %xmm3, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movzwl %ax, %eax -; F16C-O0-NEXT: vmovd %eax, %xmm2 -; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 -; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movzwl %ax, %eax -; F16C-O0-NEXT: vmovd %eax, %xmm1 -; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-O0-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; F16C-O0-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; F16C-O0-NEXT: vmovd %xmm1, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: # implicit-def: $xmm1 -; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm1, %xmm1 +; F16C-O0-NEXT: vcvtph2ps %xmm1, %ymm1 +; F16C-O0-NEXT: vcvtph2ps %xmm0, %ymm0 +; F16C-O0-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; F16C-O0-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; F16C-O0-NEXT: vzeroupper ; F16C-O0-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -7,26 +7,22 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-LABEL: doTheTestMod: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: subl $124, %esp -; CHECK-NEXT: # implicit-def: $xmm3 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm3 -; CHECK-NEXT: # implicit-def: $xmm2 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm2 -; CHECK-NEXT: # implicit-def: $xmm1 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1 -; CHECK-NEXT: # implicit-def: $xmm0 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: # implicit-def: $xmm4 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm4 -; CHECK-NEXT: # implicit-def: $xmm5 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm5 -; CHECK-NEXT: # implicit-def: $xmm6 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm6 -; CHECK-NEXT: # implicit-def: $xmm7 -; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm7 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: subl $140, %esp +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm6 +; CHECK-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; 
CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: psrlq $48, %xmm3 +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: movaps %xmm6, %xmm7 +; CHECK-NEXT: movaps %xmm6, %xmm4 +; CHECK-NEXT: psrlq $48, %xmm4 +; CHECK-NEXT: movaps %xmm6, %xmm5 +; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1,1,1] +; CHECK-NEXT: psrld $16, %xmm6 ; CHECK-NEXT: pextrw $0, %xmm7, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) @@ -45,10 +41,10 @@ ; CHECK-NEXT: pextrw $0, %xmm2, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: pextrw $0, %xmm1, %eax +; CHECK-NEXT: pextrw $0, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: pextrw $0, %xmm1, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) ; CHECK-NEXT: # implicit-def: $xmm0 @@ -174,29 +170,18 @@ ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __truncsfhf2 -; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Reload -; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Reload +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pextrw $0, %xmm3, %edx -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %dx, 6(%ecx) -; CHECK-NEXT: pextrw $0, %xmm2, %edx -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %dx, 4(%ecx) -; CHECK-NEXT: pextrw $0, %xmm1, %edx -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %dx, 2(%ecx) -; CHECK-NEXT: pextrw $0, %xmm0, %edx -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %dx, (%ecx) -; CHECK-NEXT: addl $124, %esp -; CHECK-NEXT: retl $4 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: addl $140, %esp +; CHECK-NEXT: retl Entry: %x = alloca <4 x half>, align 8 %y = alloca <4 x half>, align 8 diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -12,22 +12,14 @@ ; CHECK-NEXT: pinsrw $0, 2(%rsi), %xmm5 ; CHECK-NEXT: pinsrw $0, 4(%rsi), %xmm6 ; CHECK-NEXT: pinsrw $0, 6(%rsi), %xmm7 -; CHECK-NEXT: pextrw $0, %xmm7, %eax -; CHECK-NEXT: movw %ax, 14(%rdx) -; CHECK-NEXT: pextrw $0, %xmm3, %eax -; CHECK-NEXT: movw %ax, 12(%rdx) -; CHECK-NEXT: pextrw $0, %xmm6, %eax -; CHECK-NEXT: movw %ax, 10(%rdx) -; CHECK-NEXT: pextrw $0, %xmm2, %eax -; CHECK-NEXT: movw %ax, 8(%rdx) -; CHECK-NEXT: pextrw $0, %xmm5, %eax -; CHECK-NEXT: movw %ax, 6(%rdx) -; CHECK-NEXT: pextrw $0, %xmm1, 
%eax -; CHECK-NEXT: movw %ax, 4(%rdx) -; CHECK-NEXT: pextrw $0, %xmm4, %eax -; CHECK-NEXT: movw %ax, 2(%rdx) -; CHECK-NEXT: pextrw $0, %xmm0, %eax -; CHECK-NEXT: movw %ax, (%rdx) +; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: movdqa %xmm0, (%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2094,71 +2094,6 @@ ret <8 x i32> %cvt } -; -; Special Cases -; - -define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { -; SSE-LABEL: fptosi_2f16_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $16, %rsp -; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: callq __extendhfsf2@PLT -; SSE-NEXT: cvttss2si %xmm0, %ebx -; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: callq __extendhfsf2@PLT -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ebx, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: addq $16, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_2f16_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: pushq %rbx -; VEX-NEXT: subq $16, %rsp -; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; VEX-NEXT: callq __extendhfsf2@PLT -; VEX-NEXT: vcvttss2si %xmm0, %ebx -; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero -; VEX-NEXT: callq __extendhfsf2@PLT -; VEX-NEXT: vcvttss2si %xmm0, %eax -; VEX-NEXT: vmovd %eax, %xmm0 -; VEX-NEXT: vmovd %ebx, %xmm1 -; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: addq $16, %rsp -; VEX-NEXT: popq %rbx -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f16_to_4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx -; AVX512-NEXT: movzwl %cx, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %ecx -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: retq - %cvt = fptosi <2 x half> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind { ; SSE-LABEL: fptosi_2f80_to_4i32: ; SSE: # %bb.0: diff --git 
a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -661,16 +661,8 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; ALL-LABEL: cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -680,26 +672,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_4i16: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -710,26 +684,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -741,26 +697,8 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, 
%xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -772,164 +710,28 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX1-LABEL: cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %ecx -; AVX1-NEXT: movzwl %cx, %ecx -; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %edx -; AVX1-NEXT: shll $16, %edx -; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %edx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: movzwl %dx, %edx -; AVX1-NEXT: orl %ecx, %edx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: movzwl %si, %esi -; AVX1-NEXT: orl %ecx, %esi -; AVX1-NEXT: shlq $32, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vcvtpd2ps %ymm1, %xmm1 +; AVX1-NEXT: vcvtps2ph $0, %xmm1, %xmm1 +; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %edx -; AVX2-NEXT: shll 
$16, %edx -; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %edx, %eax -; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: shll $16, %ecx -; AVX2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %ecx, %edx -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: shll $16, %ecx -; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: movzwl %si, %esi -; AVX2-NEXT: orl %ecx, %esi -; AVX2-NEXT: shlq $32, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vcvtpd2ps %ymm1, %xmm1 +; AVX2-NEXT: vcvtps2ph $0, %xmm1, %xmm1 +; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: shll $16, %eax -; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %ecx -; AVX512-NEXT: movzwl %cx, %ecx -; AVX512-NEXT: orl %eax, %ecx -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %xmm2, %edx -; AVX512-NEXT: shll $16, %edx -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: orl %edx, %eax -; AVX512-NEXT: shlq $32, %rax -; AVX512-NEXT: orq %rcx, %rax -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %ecx -; AVX512-NEXT: shll $16, %ecx -; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %edx -; AVX512-NEXT: movzwl %dx, %edx -; AVX512-NEXT: orl %ecx, %edx -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %ecx -; AVX512-NEXT: shll $16, %ecx -; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: movzwl %si, %esi -; AVX512-NEXT: orl %ecx, %esi -; AVX512-NEXT: shlq $32, %rsi -; AVX512-NEXT: orq %rdx, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, 
%ymm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> @@ -958,15 +760,9 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %ecx -; ALL-NEXT: movw %cx, (%rdi) -; ALL-NEXT: movw %ax, 2(%rdi) +; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; ALL-NEXT: vmovss %xmm0, (%rdi) ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -977,25 +773,8 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_4i16: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; ALL-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %ecx -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %edx -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %esi -; ALL-NEXT: movw %si, 4(%rdi) -; ALL-NEXT: movw %dx, (%rdi) -; ALL-NEXT: movw %cx, 6(%rdi) -; ALL-NEXT: movw %ax, 2(%rdi) +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -1007,26 +786,8 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq @@ -1040,26 +801,8 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm2 -; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; ALL-NEXT: 
vcvtsd2ss %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq @@ -1073,131 +816,30 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX1-LABEL: store_cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %r8d -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX1-NEXT: vmovd %xmm3, %r9d -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX1-NEXT: vmovd %xmm3, %r10d -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX1-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX1-NEXT: vmovd %xmm4, %r11d -; AVX1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: movw %si, 12(%rdi) -; AVX1-NEXT: movw %dx, 8(%rdi) -; AVX1-NEXT: movw %cx, 4(%rdi) -; AVX1-NEXT: movw %ax, (%rdi) -; AVX1-NEXT: movw %r11w, 14(%rdi) -; AVX1-NEXT: movw %r10w, 10(%rdi) -; AVX1-NEXT: movw %r9w, 6(%rdi) -; AVX1-NEXT: movw %r8w, 2(%rdi) +; AVX1-NEXT: vcvtpd2ps %ymm1, %xmm1 +; AVX1-NEXT: vcvtps2ph $0, %xmm1, %xmm1 +; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovaps %xmm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %r8d -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX2-NEXT: vmovd %xmm3, %r9d -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX2-NEXT: vmovd %xmm3, %r10d -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX2-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX2-NEXT: vmovd %xmm4, %r11d -; AVX2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: 
vcvtsd2ss %xmm1, %xmm1, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: movw %si, 12(%rdi) -; AVX2-NEXT: movw %dx, 8(%rdi) -; AVX2-NEXT: movw %cx, 4(%rdi) -; AVX2-NEXT: movw %ax, (%rdi) -; AVX2-NEXT: movw %r11w, 14(%rdi) -; AVX2-NEXT: movw %r10w, 10(%rdi) -; AVX2-NEXT: movw %r9w, 6(%rdi) -; AVX2-NEXT: movw %r8w, 2(%rdi) +; AVX2-NEXT: vcvtpd2ps %ymm1, %xmm1 +; AVX2-NEXT: vcvtps2ph $0, %xmm1, %xmm1 +; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vmovaps %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %xmm1, %r8d -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %xmm2, %r9d -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512-NEXT: vmovd %xmm3, %r10d -; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512-NEXT: vcvtsd2ss %xmm4, %xmm4, %xmm4 -; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512-NEXT: vmovd %xmm4, %r11d -; AVX512-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %ecx -; AVX512-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %edx -; AVX512-NEXT: vcvtsd2ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %esi -; AVX512-NEXT: movw %si, 12(%rdi) -; AVX512-NEXT: movw %dx, 8(%rdi) -; AVX512-NEXT: movw %cx, 4(%rdi) -; AVX512-NEXT: movw %ax, (%rdi) -; AVX512-NEXT: movw %r11w, 14(%rdi) -; AVX512-NEXT: movw %r10w, 10(%rdi) -; AVX512-NEXT: movw %r9w, 6(%rdi) -; AVX512-NEXT: movw %r8w, 2(%rdi) +; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> @@ -1235,3 +877,27 @@ store <32 x half> %1, <32 x half>* %a1 ret void } + +define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { +; ALL-LABEL: fptosi_2f16_to_4i32: +; ALL: # %bb.0: +; ALL-NEXT: vpsrld $16, %xmm0, %xmm1 +; ALL-NEXT: vpextrw $0, %xmm1, %eax +; ALL-NEXT: movzwl %ax, %eax +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvttss2si %xmm1, %eax +; ALL-NEXT: vpextrw $0, %xmm0, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: vmovd %ecx, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vcvttss2si %xmm0, %ecx +; ALL-NEXT: vmovd %ecx, %xmm0 +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %cvt = fptosi <2 x half> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; @@ -370,10 +370,11 @@ ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %rbx ; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pextrw $0, %xmm1, %ebx -; SSE-NEXT: pextrw $0, %xmm0, %ebp -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pextrw $0, %xmm0, %ebx +; SSE-NEXT: pextrw $0, %xmm1, %ebp ; SSE-NEXT: callq __extendhfsf2@PLT ; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -391,10 +392,11 @@ ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %rbx ; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovdqa %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpextrw $0, %xmm1, %ebx -; AVX-NEXT: vpextrw $0, %xmm0, %ebp -; AVX-NEXT: vmovdqa %xmm1, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, %ebx +; AVX-NEXT: vpextrw $0, %xmm1, %ebp ; AVX-NEXT: callq __extendhfsf2@PLT ; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -407,20 +409,47 @@ ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2f16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax -; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm0 -; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512BW-NEXT: movzwl %ax, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 -; AVX512BW-NEXT: cmoval %eax, %ecx -; AVX512BW-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 -; AVX512BW-NEXT: retq +; AVX512F-LABEL: test_v2f16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512F-NEXT: vpextrw $0, %xmm0, %eax +; AVX512F-NEXT: movzwl %ax, %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $0, %xmm1, %eax +; AVX512F-NEXT: movzwl %ax, %eax +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomiss %xmm3, %xmm2 +; AVX512F-NEXT: movl $255, %ecx +; AVX512F-NEXT: cmovbel %eax, %ecx +; AVX512F-NEXT: 
kmovd %ecx, %k1 +; AVX512F-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_v2f16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax +; AVX512VL-NEXT: movzwl %ax, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax +; AVX512VL-NEXT: movzwl %ax, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: vucomiss %xmm3, %xmm2 +; AVX512VL-NEXT: movl $255, %ecx +; AVX512VL-NEXT: cmovbel %eax, %ecx +; AVX512VL-NEXT: kmovd %ecx, %k1 +; AVX512VL-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VL-NEXT: retq ; ; AVX512FP16-LABEL: test_v2f16: ; AVX512FP16: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; @@ -369,10 +369,11 @@ ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %rbx ; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pextrw $0, %xmm1, %ebx -; SSE-NEXT: pextrw $0, %xmm0, %ebp -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pextrw $0, %xmm0, %ebx +; SSE-NEXT: pextrw $0, %xmm1, %ebp ; SSE-NEXT: callq __extendhfsf2@PLT ; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -390,10 +391,11 @@ ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %rbx ; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovdqa %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpextrw $0, %xmm1, %ebx -; AVX-NEXT: vpextrw $0, %xmm0, %ebp -; AVX-NEXT: vmovdqa %xmm1, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, %ebx +; AVX-NEXT: vpextrw $0, %xmm1, %ebp ; AVX-NEXT: callq __extendhfsf2@PLT ; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -406,20 +408,47 @@ ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512BW-LABEL: test_v2f16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax -; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm0 
-; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512BW-NEXT: movzwl %ax, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm1
-; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512BW-NEXT: vucomiss %xmm0, %xmm1
-; AVX512BW-NEXT: cmovbl %eax, %ecx
-; AVX512BW-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT: retq
+; AVX512F-LABEL: test_v2f16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: vucomiss %xmm3, %xmm2
+; AVX512F-NEXT: movl $255, %ecx
+; AVX512F-NEXT: cmovael %eax, %ecx
+; AVX512F-NEXT: kmovd %ecx, %k1
+; AVX512F-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2f16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512VL-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512VL-NEXT: movzwl %ax, %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512VL-NEXT: movzwl %ax, %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm3
+; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm3, %xmm2
+; AVX512VL-NEXT: movl $255, %ecx
+; AVX512VL-NEXT: cmovael %eax, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; AVX512FP16-LABEL: test_v2f16:
 ; AVX512FP16: # %bb.0: